- Updated to 2.6.34-rc1.
author Jeff Mahoney <jeffm@suse.de>
Tue, 9 Mar 2010 23:03:25 +0000 (18:03 -0500)
committer Jeff Mahoney <jeffm@suse.de>
Tue, 9 Mar 2010 23:03:25 +0000 (18:03 -0500)
  - Eliminated 36 patches.
  - Xen is disabled.
  - Added new doc/config-options.changes to document configuration
    changes.

suse-commit: 79b0f88de85af29db971149723b397ef1209f413

298 files changed:
Documentation/feature-removal-schedule.txt
Documentation/filesystems/Locking
Documentation/filesystems/proc.txt
Documentation/kernel-parameters.txt
Documentation/networking/ixgbevf.txt
MAINTAINERS
Makefile
arch/ia64/Kconfig
arch/ia64/Makefile
arch/ia64/kernel/Makefile
arch/ia64/kernel/acpi.c
arch/ia64/kernel/traps.c
arch/ia64/kvm/Kconfig
arch/powerpc/kernel/of_platform.c
arch/powerpc/platforms/52xx/mpc52xx_gpt.c
arch/powerpc/platforms/cell/interrupt.c
arch/powerpc/sysdev/fsl_msi.c
arch/powerpc/sysdev/ipic.c
arch/powerpc/xmon/xmon.c
arch/s390/Kconfig
arch/s390/Makefile
arch/s390/boot/Makefile
arch/x86/Kconfig
arch/x86/Kconfig.debug
arch/x86/Makefile
arch/x86/include/asm/apic.h
arch/x86/include/asm/fixmap.h
arch/x86/include/asm/io.h
arch/x86/include/asm/irq_vectors.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/paravirt_types.h
arch/x86/include/asm/pgalloc.h
arch/x86/include/asm/ptrace.h
arch/x86/include/asm/required-features.h
arch/x86/include/asm/smp.h
arch/x86/include/asm/system.h
arch/x86/kdb/kdba_bt.c
arch/x86/kdb/kdba_support.c
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/apic/io_apic.c
arch/x86/kernel/apm_32.c
arch/x86/kernel/cpu/mcheck/Makefile
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/dumpstack_32.c
arch/x86/kernel/dumpstack_64.c
arch/x86/kernel/e820.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/hpet.c
arch/x86/kernel/machine_kexec_32.c
arch/x86/kernel/machine_kexec_64.c
arch/x86/kernel/paravirt.c
arch/x86/kernel/pci-dma.c
arch/x86/kernel/process.c
arch/x86/kernel/reboot.c
arch/x86/kernel/traps.c
arch/x86/kernel/tsc.c
arch/x86/kernel/x8664_ksyms_64.c
arch/x86/kvm/Kconfig
arch/x86/kvm/svm.c
arch/x86/kvm/x86.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/oprofile/nmi_int.c
arch/x86/xen/Kconfig
drivers/Makefile
drivers/acpi/numa.c
drivers/acpi/osl.c
drivers/ata/ahci.c
drivers/ata/ata_piix.c
drivers/ata/libata-core.c
drivers/atm/fore200e.c
drivers/block/Kconfig
drivers/block/Makefile
drivers/char/Kconfig
drivers/char/agp/intel-agp.c
drivers/char/keyboard.c
drivers/char/mem.c
drivers/crypto/amcc/crypto4xx_core.c
drivers/crypto/talitos.c
drivers/dma/fsldma.c
drivers/hid/hid-apple.c
drivers/hid/hid-core.c
drivers/hid/hid-ids.h
drivers/hid/usbhid/hid-core.c
drivers/ieee1394/sbp2.c
drivers/input/serio/xilinx_ps2.c
drivers/input/touchscreen/Kconfig
drivers/input/touchscreen/Makefile
drivers/macintosh/Kconfig
drivers/macintosh/adb.c
drivers/macintosh/therm_pm72.c
drivers/macintosh/therm_windtunnel.c
drivers/md/dm-mpath.c
drivers/md/dm-raid45.c
drivers/md/dm-table.c
drivers/md/dm.c
drivers/media/video/uvc/uvc_ctrl.c
drivers/media/video/uvc/uvc_driver.c
drivers/media/video/uvc/uvcvideo.h
drivers/mtd/maps/omap_nor.c
drivers/net/bnx2.c
drivers/net/e1000/e1000_main.c
drivers/net/e1000e/e1000.h
drivers/net/e1000e/netdev.c
drivers/net/ehea/ehea_main.c
drivers/net/fs_enet/fs_enet-main.c
drivers/net/fs_enet/mii-fec.c
drivers/net/gianfar.c
drivers/net/ibm_newemac/core.c
drivers/net/igb/igb_main.c
drivers/net/ixgbe/ixgbe_main.c
drivers/net/myri_sbus.c
drivers/net/niu.c
drivers/net/sky2.c
drivers/net/sunbmac.c
drivers/net/sunhme.c
drivers/net/sunlance.c
drivers/net/sunqe.c
drivers/net/tehuti.c
drivers/net/tg3.c
drivers/net/tulip/tulip_core.c
drivers/net/ucc_geth.c
drivers/net/wireless/Kconfig
drivers/pcmcia/electra_cf.c
drivers/pcmcia/m8xx_pcmcia.c
drivers/scsi/hpsa.c
drivers/scsi/ibmvscsi/ibmvscsi.c
drivers/scsi/lpfc/lpfc_hw.h
drivers/scsi/lpfc/lpfc_init.c
drivers/scsi/qla4xxx/ql4_init.c
drivers/scsi/scsi_lib.c
drivers/scsi/scsi_scan.c
drivers/scsi/scsi_sysfs.c
drivers/scsi/sd.c
drivers/serial/8250.c
drivers/serial/mpc52xx_uart.c
drivers/spi/spi_mpc8xxx.c
drivers/usb/core/hcd.c
drivers/usb/core/hcd.h
drivers/usb/gadget/fsl_qe_udc.c
drivers/usb/host/ehci-hcd.c
drivers/usb/host/fhci-hcd.c
drivers/usb/host/isp1760-if.c
drivers/usb/host/ohci-hcd.c
drivers/usb/host/uhci-hcd.c
drivers/video/Kconfig
drivers/video/console/fbcon.c
drivers/video/console/vgacon.c
drivers/watchdog/cpwd.c
drivers/watchdog/riowd.c
fs/Kconfig
fs/Makefile
fs/bio.c
fs/compat_ioctl.c
fs/dlm/dlm_internal.h
fs/exec.c
fs/ext3/file.c
fs/ext3/ialloc.c
fs/ext3/inode.c
fs/ext3/namei.c
fs/ext3/super.c
fs/ext3/xattr.c
fs/gfs2/ops_fstype.c
fs/namei.c
fs/nfs/Kconfig
fs/nfs/dns_resolve.c
fs/nfs/file.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/write.c
fs/nfsd/vfs.c
fs/ocfs2/Makefile
fs/ocfs2/aops.c
fs/ocfs2/cluster/masklog.c
fs/ocfs2/cluster/masklog.h
fs/ocfs2/dir.c
fs/ocfs2/file.c
fs/ocfs2/inode.c
fs/ocfs2/localalloc.c
fs/ocfs2/ocfs2.h
fs/ocfs2/suballoc.c
fs/ocfs2/suballoc.h
fs/ocfs2/super.c
fs/partitions/check.c
fs/proc/array.c
fs/proc/base.c
fs/reiserfs/super.c
fs/super.c
fs/xfs/Makefile
fs/xfs/dmapi/xfs_dm.c
fs/xfs/linux-2.6/xfs_file.c
fs/xfs/linux-2.6/xfs_iops.c
fs/xfs/linux-2.6/xfs_iops.h
fs/xfs/linux-2.6/xfs_linux.h
fs/xfs/linux-2.6/xfs_super.c
fs/xfs/xfs_itable.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_rw.c
fs/xfs/xfs_rw.h
fs/xfs/xfs_vnodeops.c
include/linux/audit.h
include/linux/blkdev.h
include/linux/device.h
include/linux/ext3_fs.h
include/linux/ext3_fs_i.h
include/linux/fb.h
include/linux/fs.h
include/linux/genhd.h
include/linux/gfp.h
include/linux/irq.h
include/linux/jbd.h
include/linux/kernel.h
include/linux/libata.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmzone.h
include/linux/module.h
include/linux/nfs_fs.h
include/linux/page-flags.h
include/linux/sched.h
include/linux/security.h
include/linux/skbuff.h
include/linux/slab.h
include/linux/slub_def.h
include/linux/swap.h
include/net/netns/ipv6.h
include/net/sock.h
include/scsi/scsi_device.h
init/Kconfig
init/main.c
ipc/mqueue.c
kdb/modules/kdbm_vm.c
kernel/Kconfig.preempt
kernel/Makefile
kernel/capability.c
kernel/cgroup.c
kernel/exit.c
kernel/irq/chip.c
kernel/irq/handle.c
kernel/irq/spurious.c
kernel/kexec.c
kernel/ksysfs.c
kernel/module.c
kernel/panic.c
kernel/posix-cpu-timers.c
kernel/printk.c
kernel/ptrace.c
kernel/sched.c
kernel/signal.c
kernel/sys.c
kernel/sysctl.c
kernel/sysctl_binary.c
kernel/taskstats.c
lib/Kconfig.debug
mm/Makefile
mm/filemap.c
mm/hugetlb.c
mm/memcontrol.c
mm/memory.c
mm/migrate.c
mm/mmap.c
mm/page_alloc.c
mm/page_io.c
mm/slab.c
mm/slub.c
mm/swapfile.c
mm/vmscan.c
mm/vmstat.c
net/bridge/br_if.c
net/core/dev.c
net/core/filter.c
net/core/skbuff.c
net/core/sock.c
net/ipv4/ip_fragment.c
net/ipv4/route.c
net/ipv4/tcp.c
net/ipv4/tcp_input.c
net/ipv4/tcp_output.c
net/ipv6/addrconf.c
net/ipv6/reassembly.c
net/ipv6/route.c
net/ipv6/tcp_ipv6.c
net/mac80211/Kconfig
net/netfilter/Kconfig
net/netfilter/Makefile
net/netfilter/nf_conntrack_netlink.c
net/sunrpc/svc_xprt.c
net/sunrpc/svcauth_unix.c
net/sunrpc/xprtsock.c
scripts/Makefile.build
scripts/kconfig/Makefile
security/apparmor/path.c
security/capability.c
security/security.c
security/selinux/avc.c
security/selinux/hooks.c
virt/kvm/ioapic.c

Simple merge
Simple merge
Simple merge
index 0000000,19015de..19015de
mode 000000,100755..100644
--- /dev/null
diff --cc MAINTAINERS
Simple merge
diff --cc Makefile
Simple merge
Simple merge
@@@ -56,9 -55,8 +55,9 @@@ core-$(CONFIG_IA64_XEN_GUEST) += arch/i
  core-$(CONFIG_IA64_SGI_SN2)   += arch/ia64/sn/
  core-$(CONFIG_IA64_SGI_UV)    += arch/ia64/uv/
  core-$(CONFIG_KVM)            += arch/ia64/kvm/
- core-$(CONFIG_PARAVIRT_XEN)   += arch/ia64/xen/
+ core-$(CONFIG_XEN)            += arch/ia64/xen/
  
 +drivers-$(CONFIG_KDB)         += arch/$(ARCH)/kdb/
  drivers-$(CONFIG_PCI)         += arch/ia64/pci/
  drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/
  drivers-$(CONFIG_IA64_HP_ZX1) += arch/ia64/hp/common/ arch/ia64/hp/zx1/
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -99,12 -105,12 +105,12 @@@ drivers-$(CONFIG_OPROFILE)       += arch/s390
  
  boot          := arch/s390/boot
  
- all: image kerntypes.o
 -all: image bzImage
++all: image bzImage kerntypes.o
  
  install: vmlinux
        $(Q)$(MAKE) $(build)=$(boot) $@
  
- image kerntypes.o: vmlinux
 -image bzImage: vmlinux
++image bzImage kerntypes.o: vmlinux
        $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
  
  zfcpdump:
@@@ -2,27 -2,25 +2,36 @@@
  # Makefile for the linux s390-specific parts of the memory manager.
  #
  
 -COMPILE_VERSION := __linux_compile_version_id__`hostname |  \
 -                      tr -c '[0-9A-Za-z]' '_'`__`date | \
 -                      tr -c '[0-9A-Za-z]' '_'`_t
 +COMPILE_VERSION := __linux_compile_version_id__$(shell hostname |  \
 +                      tr -c '[0-9A-Za-z]' '_')__$(shell date | \
 +                      tr -c '[0-9A-Za-z]' '_')_t
  
 +
 +chk-option = $(shell if $(CC) $(CFLAGS) $(1) -S -o /dev/null -xc /dev/null \
 +           > /dev/null 2>&1; then echo "$(1)"; fi ;)
 +
 +# Remove possible '-g' from CFLAGS_KERNEL, since we want to use stabs
 +# debug format.
 +override CFLAGS_KERNEL := $(shell echo $(CFLAGS_KERNEL) | sed 's/-g//')
  EXTRA_CFLAGS  := -DCOMPILE_VERSION=$(COMPILE_VERSION) -gstabs -I.
 +# Assume we don't need the flag if the compiler doesn't know about it
 +EXTRA_CFLAGS  += $(call chk-option,-fno-eliminate-unused-debug-types)
 +
  
- targets := image kerntypes.o
+ targets := image
+ targets += bzImage
+ subdir- := compressed
++targets += kerntypes.o
  
  $(obj)/image: vmlinux FORCE
        $(call if_changed,objcopy)
  
+ $(obj)/bzImage: $(obj)/compressed/vmlinux FORCE
+       $(call if_changed,objcopy)
+ $(obj)/compressed/vmlinux: FORCE
+       $(Q)$(MAKE) $(build)=$(obj)/compressed $@
  install: $(CONFIGURE) $(obj)/image
        sh -x  $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/image \
 -            System.map Kerntypes "$(INSTALL_PATH)"
 +            System.map "$(INSTALL_PATH)"
Simple merge
@@@ -313,91 -310,4 +310,91 @@@ config DEBUG_STRICT_USER_COPY_CHECK
  
          If unsure, or if you run an older (pre 4.4) gcc, say N.
  
 +config KDB
 +      bool "Built-in Kernel Debugger support"
-       depends on DEBUG_KERNEL && !XEN
++      depends on DEBUG_KERNEL
 +      select KALLSYMS
 +      select KALLSYMS_ALL
 +      help
 +        This option provides a built-in kernel debugger.  The built-in
 +        kernel debugger contains commands which allow memory to be examined,
 +        instructions to be disassembled and breakpoints to be set.  For details,
 +        see Documentation/kdb/kdb.mm and the manual pages kdb_bt, kdb_ss, etc.
 +        Kdb can also be used via the serial port.  Set up the system to
 +        have a serial console (see Documentation/serial-console.txt).
 +        The key sequence <escape>KDB on the serial port will cause the
 +        kernel debugger to be entered with input from the serial port and
 +        output to the serial console.  If unsure, say N.
 +
 +config KDB_MODULES
 +      tristate "KDB modules"
 +      depends on KDB
 +      help
 +        KDB can be extended by adding your own modules, in directory
 +        kdb/modules.  This option selects the way that these modules should
 +        be compiled, as free standing modules (select M) or built into the
 +        kernel (select Y).  If unsure say M.
 +
 +config KDB_OFF
 +      bool "KDB off by default"
 +      depends on KDB
 +      help
 +        Normally kdb is activated by default, as long as CONFIG_KDB is set.
 +        If you want to ship a kernel with kdb support but only have kdb
 +        turned on when the user requests it then select this option.  When
 +        compiled with CONFIG_KDB_OFF, kdb ignores all events unless you boot
 +        with kdb=on or you echo "1" > /proc/sys/kernel/kdb.  This option also
 +        works in reverse, if kdb is normally activated, you can boot with
 +        kdb=off or echo "0" > /proc/sys/kernel/kdb to deactivate kdb. If
 +        unsure, say N.
 +
 +config KDB_CONTINUE_CATASTROPHIC
 +      int "KDB continues after catastrophic errors"
 +      depends on KDB
 +      default "0"
 +      help
 +        This integer controls the behaviour of kdb when the kernel gets a
 +        catastrophic error, i.e. for a panic, oops, NMI or other watchdog
 +        tripping.  CONFIG_KDB_CONTINUE_CATASTROPHIC interacts with
 +        /proc/sys/kernel/kdb and CONFIG_LKCD_DUMP (if your kernel has the
 +        LKCD patch).
 +        When KDB is active (/proc/sys/kernel/kdb == 1) and a catastrophic
 +        error occurs, nothing extra happens until you type 'go'.
 +        CONFIG_KDB_CONTINUE_CATASTROPHIC == 0 (default).  The first time
 +        you type 'go', kdb warns you.  The second time you type 'go', KDB
 +        tries to continue - no guarantees that the kernel is still usable.
 +        CONFIG_KDB_CONTINUE_CATASTROPHIC == 1.  KDB tries to continue - no
 +        guarantees that the kernel is still usable.
 +        CONFIG_KDB_CONTINUE_CATASTROPHIC == 2.  If your kernel has the LKCD
 +        patch and LKCD is configured to take a dump then KDB forces a dump.
 +        Whether or not a dump is taken, KDB forces a reboot.
 +        When KDB is not active (/proc/sys/kernel/kdb == 0) and a catastrophic
 +        error occurs, the following steps are automatic, no human
 +        intervention is required.
 +        CONFIG_KDB_CONTINUE_CATASTROPHIC == 0 (default) or 1.  KDB attempts
 +        to continue - no guarantees that the kernel is still usable.
 +        CONFIG_KDB_CONTINUE_CATASTROPHIC == 2.  If your kernel has the LKCD
 +        patch and LKCD is configured to take a dump then KDB automatically
 +        forces a dump.  Whether or not a dump is taken, KDB forces a
 +        reboot.
 +        If you are not sure, say 0.  Read Documentation/kdb/dump.txt before
 +        setting to 2.
 +
 +config KDB_USB
 +      bool "Support for USB Keyboard in KDB"
 +      depends on KDB && (USB_OHCI_HCD || USB_EHCI_HCD || USB_UHCI_HCD)
 +      help
 +        If you want to use kdb from USB keyboards then say Y here.  If you
 +        say N then kdb can only be used from a PC (AT) keyboard or a serial
 +        console.
 +
 +config KDB_KDUMP
 +      bool "Support for Kdump in KDB"
 +      depends on KDB
 +      select KEXEC
 +      default N
 +      help
 +        If you want to take Kdump kernel vmcore from KDB then say Y here.
 +        If unsure, say N.
 +
  endmenu
@@@ -137,13 -135,8 +137,11 @@@ drivers-$(CONFIG_OPROFILE) += arch/x86/
  # suspend and hibernation support
  drivers-$(CONFIG_PM) += arch/x86/power/
  
- ifeq ($(CONFIG_X86_32),y)
  drivers-$(CONFIG_FB) += arch/x86/video/
- endif
  
 +# KDB support
 +drivers-$(CONFIG_KDB) += arch/x86/kdb/
 +
  ####
  # boot loader support. Several targets are kept for legacy purposes
  
Simple merge
Simple merge
@@@ -173,11 -207,126 +207,126 @@@ static inline void __iomem *ioremap(res
  extern void iounmap(volatile void __iomem *addr);
  
  
- #ifdef CONFIG_X86_32
- # include "io_32.h"
+ #ifdef __KERNEL__
+ #include <asm-generic/iomap.h>
+ #include <linux/vmalloc.h>
+ /*
+  * Convert a virtual cached pointer to an uncached pointer
+  */
+ #define xlate_dev_kmem_ptr(p) p
+ static inline void
+ memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
+ {
+       memset((void __force *)addr, val, count);
+ }
+ static inline void
+ memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
+ {
+       memcpy(dst, (const void __force *)src, count);
+ }
+ static inline void
+ memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
+ {
+       memcpy((void __force *)dst, src, count);
+ }
+ /*
+  * ISA space is 'always mapped' on a typical x86 system, no need to
+  * explicitly ioremap() it. The fact that the ISA IO space is mapped
+  * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
+  * are physical addresses. The following constant pointer can be
+  * used as the IO-area pointer (it can be iounmapped as well, so the
+  * analogy with PCI is quite large):
+  */
+ #define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
+ /*
+  *    Cache management
+  *
+  *    This needed for two cases
+  *    1. Out of order aware processors
+  *    2. Accidentally out of order processors (PPro errata #51)
+  */
+ static inline void flush_write_buffers(void)
+ {
+ #if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
+       asm volatile("lock; addl $0,0(%%esp)": : :"memory");
+ #endif
+ }
+ #endif /* __KERNEL__ */
+ extern void native_io_delay(void);
+ extern int io_delay_type;
+ extern void io_delay_init(void);
 -#if defined(CONFIG_PARAVIRT)
++#if defined(CONFIG_PARAVIRT_CPU)
+ #include <asm/paravirt.h>
  #else
- # include "io_64.h"
+ static inline void slow_down_io(void)
+ {
+       native_io_delay();
+ #ifdef REALLY_SLOW_IO
+       native_io_delay();
+       native_io_delay();
+       native_io_delay();
  #endif
+ }
+ #endif
+ #define BUILDIO(bwl, bw, type)                                                \
+ static inline void out##bwl(unsigned type value, int port)            \
+ {                                                                     \
+       asm volatile("out" #bwl " %" #bw "0, %w1"                       \
+                    : : "a"(value), "Nd"(port));                       \
+ }                                                                     \
+                                                                       \
+ static inline unsigned type in##bwl(int port)                         \
+ {                                                                     \
+       unsigned type value;                                            \
+       asm volatile("in" #bwl " %w1, %" #bw "0"                        \
+                    : "=a"(value) : "Nd"(port));                       \
+       return value;                                                   \
+ }                                                                     \
+                                                                       \
+ static inline void out##bwl##_p(unsigned type value, int port)                \
+ {                                                                     \
+       out##bwl(value, port);                                          \
+       slow_down_io();                                                 \
+ }                                                                     \
+                                                                       \
+ static inline unsigned type in##bwl##_p(int port)                     \
+ {                                                                     \
+       unsigned type value = in##bwl(port);                            \
+       slow_down_io();                                                 \
+       return value;                                                   \
+ }                                                                     \
+                                                                       \
+ static inline void outs##bwl(int port, const void *addr, unsigned long count) \
+ {                                                                     \
+       asm volatile("rep; outs" #bwl                                   \
+                    : "+S"(addr), "+c"(count) : "d"(port));            \
+ }                                                                     \
+                                                                       \
+ static inline void ins##bwl(int port, void *addr, unsigned long count)        \
+ {                                                                     \
+       asm volatile("rep; ins" #bwl                                    \
+                    : "+D"(addr), "+c"(count) : "d"(port));            \
+ }
+ BUILDIO(b, b, char)
+ BUILDIO(w, w, short)
+ BUILDIO(l, , int)
  
  extern void *xlate_dev_mem_ptr(unsigned long phys);
  extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
   */
  #define IRQ_MOVE_CLEANUP_VECTOR               FIRST_EXTERNAL_VECTOR
  
+ #define IA32_SYSCALL_VECTOR           0x80
+ #ifdef CONFIG_X86_32
+ # define SYSCALL_VECTOR                       0x80
+ #endif
++#define KDBENTER_VECTOR       0x81
  /*
   * Vectors 0x30-0x3f are used for ISA interrupts.
+  *   round up to the next 16-vector boundary
   */
- #define IRQ0_VECTOR                   (FIRST_EXTERNAL_VECTOR + 0x10)
+ #define IRQ0_VECTOR                   ((FIRST_EXTERNAL_VECTOR + 16) & ~15)
  
  #define IRQ1_VECTOR                   (IRQ0_VECTOR +  1)
  #define IRQ2_VECTOR                   (IRQ0_VECTOR +  2)
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -48,7 -48,7 +48,7 @@@
  #endif
  
  #ifdef CONFIG_X86_64
- #if defined(CONFIG_PARAVIRT_MMU) || defined(CONFIG_XEN)
 -#ifdef CONFIG_PARAVIRT
++#ifdef CONFIG_PARAVIRT_MMU
  /* Paravirtualized systems may not have PSE or PGE available */
  #define NEED_PSE      0
  #define NEED_PGE      0
Simple merge
Simple merge
index 136268c,0000000..50d2806
mode 100644,000000..100644
--- /dev/null
@@@ -1,5760 -1,0 +1,5757 @@@
 +/*
 + * This file is subject to the terms and conditions of the GNU General Public
 + * License.  See the file "COPYING" in the main directory of this archive
 + * for more details.
 + *
 + * Copyright (c) 2006, 2007-2009 Silicon Graphics, Inc.  All Rights Reserved.
 + *
 + * Common code for doing accurate backtraces on i386 and x86_64, including
 + * printing the values of arguments.
 + */
 +
 +#include <linux/init.h>
 +#include <linux/kallsyms.h>
 +#include <linux/kdb.h>
 +#include <linux/kdbprivate.h>
 +#include <linux/ctype.h>
 +#include <linux/string.h>
 +#include <linux/stringify.h>
 +#include <linux/kernel.h>
 +#include <linux/sched.h>
 +#include <linux/nmi.h>
 +#include <asm/asm-offsets.h>
 +#include <asm/system.h>
 +
 +#define KDB_DEBUG_BB(fmt, ...)                                                        \
 +      {if (KDB_DEBUG(BB)) kdb_printf(fmt, ## __VA_ARGS__);}
 +#define KDB_DEBUG_BB_OFFSET_PRINTF(offset, prefix, suffix)                    \
 +      kdb_printf(prefix "%c0x%x" suffix,                                      \
 +                 offset >= 0 ? '+' : '-',                                     \
 +                 offset >= 0 ? offset : -offset)
 +#define KDB_DEBUG_BB_OFFSET(offset, prefix, suffix)                           \
 +      {if (KDB_DEBUG(BB)) KDB_DEBUG_BB_OFFSET_PRINTF(offset, prefix, suffix);}
 +
 +#define       BB_CHECK(expr, val, ret)                                                \
 +({                                                                            \
 +      if (unlikely(expr)) {                                                   \
 +              kdb_printf("%s, line %d: BB_CHECK(" #expr ") failed "           \
 +                      #val "=%lx\n",                                          \
 +                      __FUNCTION__, __LINE__, (long)val);                     \
 +              bb_giveup = 1;                                                  \
 +              return ret;                                                     \
 +      }                                                                       \
 +})
 +
 +static int bb_giveup;
 +
 +/* Use BBRG_Rxx for both i386 and x86_64.  RAX through R15 must be at the end,
 + * starting with RAX.  Some of these codes do not reflect actual registers,
 + * such codes are special cases when parsing the record of register changes.
 + * When updating BBRG_ entries, update bbrg_name as well.
 + */
 +
 +enum bb_reg_code
 +{
 +      BBRG_UNDEFINED = 0,     /* Register contents are undefined */
 +      BBRG_OSP,               /* original stack pointer on entry to function */
 +      BBRG_RAX,
 +      BBRG_RBX,
 +      BBRG_RCX,
 +      BBRG_RDX,
 +      BBRG_RDI,
 +      BBRG_RSI,
 +      BBRG_RBP,
 +      BBRG_RSP,
 +      BBRG_R8,
 +      BBRG_R9,
 +      BBRG_R10,
 +      BBRG_R11,
 +      BBRG_R12,
 +      BBRG_R13,
 +      BBRG_R14,
 +      BBRG_R15,
 +};
 +
 +const static char *bbrg_name[] = {
 +      [BBRG_UNDEFINED]   = "undefined",
 +      [BBRG_OSP]         = "osp",
 +      [BBRG_RAX]         = "rax",
 +      [BBRG_RBX]         = "rbx",
 +      [BBRG_RCX]         = "rcx",
 +      [BBRG_RDX]         = "rdx",
 +      [BBRG_RDI]         = "rdi",
 +      [BBRG_RSI]         = "rsi",
 +      [BBRG_RBP]         = "rbp",
 +      [BBRG_RSP]         = "rsp",
 +      [BBRG_R8]          = "r8",
 +      [BBRG_R9]          = "r9",
 +      [BBRG_R10]         = "r10",
 +      [BBRG_R11]         = "r11",
 +      [BBRG_R12]         = "r12",
 +      [BBRG_R13]         = "r13",
 +      [BBRG_R14]         = "r14",
 +      [BBRG_R15]         = "r15",
 +};
 +
 +/* Map a register name to its register code.  This includes the sub-register
 + * addressable fields, e.g. parts of rax can be addressed as ax, al, ah, eax.
 + * The list is sorted so it can be binary chopped, sort command is:
 + *   LANG=C sort -t '"' -k2
 + */
 +
 +struct bb_reg_code_map {
 +      enum bb_reg_code reg;
 +      const char *name;
 +};
 +
 +const static struct bb_reg_code_map
 +bb_reg_code_map[] = {
 +      { BBRG_RAX, "ah" },
 +      { BBRG_RAX, "al" },
 +      { BBRG_RAX, "ax" },
 +      { BBRG_RBX, "bh" },
 +      { BBRG_RBX, "bl" },
 +      { BBRG_RBP, "bp" },
 +      { BBRG_RBP, "bpl" },
 +      { BBRG_RBX, "bx" },
 +      { BBRG_RCX, "ch" },
 +      { BBRG_RCX, "cl" },
 +      { BBRG_RCX, "cx" },
 +      { BBRG_RDX, "dh" },
 +      { BBRG_RDI, "di" },
 +      { BBRG_RDI, "dil" },
 +      { BBRG_RDX, "dl" },
 +      { BBRG_RDX, "dx" },
 +      { BBRG_RAX, "eax" },
 +      { BBRG_RBP, "ebp" },
 +      { BBRG_RBX, "ebx" },
 +      { BBRG_RCX, "ecx" },
 +      { BBRG_RDI, "edi" },
 +      { BBRG_RDX, "edx" },
 +      { BBRG_RSI, "esi" },
 +      { BBRG_RSP, "esp" },
 +      { BBRG_R10, "r10" },
 +      { BBRG_R10, "r10d" },
 +      { BBRG_R10, "r10l" },
 +      { BBRG_R10, "r10w" },
 +      { BBRG_R11, "r11" },
 +      { BBRG_R11, "r11d" },
 +      { BBRG_R11, "r11l" },
 +      { BBRG_R11, "r11w" },
 +      { BBRG_R12, "r12" },
 +      { BBRG_R12, "r12d" },
 +      { BBRG_R12, "r12l" },
 +      { BBRG_R12, "r12w" },
 +      { BBRG_R13, "r13" },
 +      { BBRG_R13, "r13d" },
 +      { BBRG_R13, "r13l" },
 +      { BBRG_R13, "r13w" },
 +      { BBRG_R14, "r14" },
 +      { BBRG_R14, "r14d" },
 +      { BBRG_R14, "r14l" },
 +      { BBRG_R14, "r14w" },
 +      { BBRG_R15, "r15" },
 +      { BBRG_R15, "r15d" },
 +      { BBRG_R15, "r15l" },
 +      { BBRG_R15, "r15w" },
 +      { BBRG_R8,  "r8" },
 +      { BBRG_R8,  "r8d" },
 +      { BBRG_R8,  "r8l" },
 +      { BBRG_R8,  "r8w" },
 +      { BBRG_R9,  "r9" },
 +      { BBRG_R9,  "r9d" },
 +      { BBRG_R9,  "r9l" },
 +      { BBRG_R9,  "r9w" },
 +      { BBRG_RAX, "rax" },
 +      { BBRG_RBP, "rbp" },
 +      { BBRG_RBX, "rbx" },
 +      { BBRG_RCX, "rcx" },
 +      { BBRG_RDI, "rdi" },
 +      { BBRG_RDX, "rdx" },
 +      { BBRG_RSI, "rsi" },
 +      { BBRG_RSP, "rsp" },
 +      { BBRG_RSI, "si" },
 +      { BBRG_RSI, "sil" },
 +      { BBRG_RSP, "sp" },
 +      { BBRG_RSP, "spl" },
 +};
 +
 +/* Record register contents in terms of the values that were passed to this
 + * function, IOW track which registers contain an input value.  A register's
 + * contents can be undefined, it can contain an input register value or it can
 + * contain an offset from the original stack pointer.
 + *
 + * This structure is used to represent the current contents of the integer
 + * registers, it is held in an array that is indexed by BBRG_xxx.  The element
 + * for BBRG_xxx indicates what input value is currently in BBRG_xxx.  When
 + * 'value' is BBRG_OSP then register BBRG_xxx contains a stack pointer,
 + * pointing at 'offset' from the original stack pointer on entry to the
 + * function.  When 'value' is not BBRG_OSP then element BBRG_xxx contains the
 + * original contents of an input register and offset is ignored.
 + *
 + * An input register 'value' can be stored in more than one register and/or in
 + * more than one memory location.
 + */
 +
 +struct bb_reg_contains
 +{
 +      enum bb_reg_code value: 8;
 +      short offset;
 +};
 +
 +/* Note: the offsets in struct bb_mem_contains in this code are _NOT_ offsets
 + * from OSP, they are offsets from current RSP.  It fits better with the way
 + * that struct pt_regs is built, some code pushes extra data before pt_regs so
 + * working with OSP relative offsets gets messy.  struct bb_mem_contains
 + * entries must be in descending order of RSP offset.
 + */
 +
 +typedef struct { DECLARE_BITMAP(bits, BBRG_R15+1); } bbrgmask_t;
 +#define BB_SKIP(reg) (1 << (BBRG_ ## reg))
 +struct bb_mem_contains {
 +      short offset_address;
 +      enum bb_reg_code value: 8;
 +};
 +
 +/* Transfer of control to a label outside the current function.  If the
 + * transfer is to a known common restore path that expects known registers
 + * and/or a known memory state (e.g. struct pt_regs) then do a sanity check on
 + * the state at this point.
 + */
 +
 +struct bb_name_state {
 +      const char *name;                       /* target function */
 +      bfd_vma address;                        /* Address of target function */
 +      const char *fname;                      /* optional from function name */
 +      const struct bb_mem_contains *mem;      /* expected memory state */
 +      const struct bb_reg_contains *regs;     /* expected register state */
 +      const unsigned short mem_size;          /* ARRAY_SIZE(mem) */
 +      const unsigned short regs_size;         /* ARRAY_SIZE(regs) */
 +      const short osp_offset;                 /* RSP in regs == OSP+osp_offset */
 +      const bbrgmask_t skip_mem;              /* Some slots in mem may be undefined */
 +      const bbrgmask_t skip_regs;             /* Some slots in regs may be undefined */
 +};
 +
 +/* NS (NAME_STATE) macros define the register and memory state when we transfer
 + * control to or start decoding a special case name.  Use NS when the target
 + * label always has the same state.  Use NS_FROM and specify the source label
 + * if the target state is slightly different depending on where it is branched
 + * from.  This gives better state checking, by isolating the special cases.
 + *
 + * Note: for the same target label, NS_FROM entries must be followed by a
 + * single NS entry.
 + */
 +
 +#define       NS_FROM(iname, ifname, imem, iregs, iskip_mem, iskip_regs, iosp_offset) \
 +      { \
 +              .name = iname, \
 +              .fname = ifname, \
 +              .mem = imem, \
 +              .regs = iregs, \
 +              .mem_size = ARRAY_SIZE(imem), \
 +              .regs_size = ARRAY_SIZE(iregs), \
 +              .skip_mem.bits[0] = iskip_mem, \
 +              .skip_regs.bits[0] = iskip_regs, \
 +              .osp_offset = iosp_offset, \
 +              .address = 0 \
 +      }
 +
 +/* Shorter forms for the common cases */
 +#define       NS(iname, imem, iregs, iskip_mem, iskip_regs, iosp_offset) \
 +        NS_FROM(iname, NULL, imem, iregs, iskip_mem, iskip_regs, iosp_offset)
 +#define       NS_MEM(iname, imem, iskip_mem) \
 +        NS_FROM(iname, NULL, imem, no_regs, iskip_mem, 0, 0)
 +#define       NS_MEM_FROM(iname, ifname, imem, iskip_mem) \
 +        NS_FROM(iname, ifname, imem, no_regs, iskip_mem, 0, 0)
 +#define       NS_REG(iname, iregs, iskip_regs) \
 +        NS_FROM(iname, NULL, no_memory, iregs, 0, iskip_regs, 0)
 +#define       NS_REG_FROM(iname, ifname, iregs, iskip_regs) \
 +        NS_FROM(iname, ifname, no_memory, iregs, 0, iskip_regs, 0)
 +
 +static void
 +bb_reg_code_set_value(enum bb_reg_code dst, enum bb_reg_code src);
 +
 +static const char *bb_mod_name, *bb_func_name;
 +
 +static int
 +bb_noret(const char *name)
 +{
 +      if (strcmp(name, "panic") == 0 ||
 +          strcmp(name, "do_exit") == 0 ||
 +          strcmp(name, "do_group_exit") == 0 ||
 +          strcmp(name, "complete_and_exit") == 0)
 +              return 1;
 +      return 0;
 +}
 +
 +/*============================================================================*/
 +/*                                                                            */
 +/* Most of the basic block code and data is common to x86_64 and i386.  This  */
 +/* large ifdef  contains almost all of the differences between the two        */
 +/* architectures.                                                             */
 +/*                                                                            */
 +/* Make sure you update the correct section of this ifdef.                    */
 +/*                                                                            */
 +/*============================================================================*/
 +
 +#ifdef        CONFIG_X86_64
 +
 +/* Registers that can be used to pass parameters, in the order that parameters
 + * are passed.
 + */
 +
 +const static enum bb_reg_code
 +bb_param_reg[] = {
 +      BBRG_RDI,
 +      BBRG_RSI,
 +      BBRG_RDX,
 +      BBRG_RCX,
 +      BBRG_R8,
 +      BBRG_R9,
 +};
 +
 +const static enum bb_reg_code
 +bb_preserved_reg[] = {
 +      BBRG_RBX,
 +      BBRG_RBP,
 +      BBRG_RSP,
 +      BBRG_R12,
 +      BBRG_R13,
 +      BBRG_R14,
 +      BBRG_R15,
 +};
 +
 +static const struct bb_mem_contains full_pt_regs[] = {
 +      { 0x70, BBRG_RDI },
 +      { 0x68, BBRG_RSI },
 +      { 0x60, BBRG_RDX },
 +      { 0x58, BBRG_RCX },
 +      { 0x50, BBRG_RAX },
 +      { 0x48, BBRG_R8  },
 +      { 0x40, BBRG_R9  },
 +      { 0x38, BBRG_R10 },
 +      { 0x30, BBRG_R11 },
 +      { 0x28, BBRG_RBX },
 +      { 0x20, BBRG_RBP },
 +      { 0x18, BBRG_R12 },
 +      { 0x10, BBRG_R13 },
 +      { 0x08, BBRG_R14 },
 +      { 0x00, BBRG_R15 },
 +};
 +static const struct bb_mem_contains full_pt_regs_plus_1[] = {
 +      { 0x78, BBRG_RDI },
 +      { 0x70, BBRG_RSI },
 +      { 0x68, BBRG_RDX },
 +      { 0x60, BBRG_RCX },
 +      { 0x58, BBRG_RAX },
 +      { 0x50, BBRG_R8  },
 +      { 0x48, BBRG_R9  },
 +      { 0x40, BBRG_R10 },
 +      { 0x38, BBRG_R11 },
 +      { 0x30, BBRG_RBX },
 +      { 0x28, BBRG_RBP },
 +      { 0x20, BBRG_R12 },
 +      { 0x18, BBRG_R13 },
 +      { 0x10, BBRG_R14 },
 +      { 0x08, BBRG_R15 },
 +};
 +/*
 + * Going into error_exit we have the hardware pushed error_code on the stack
 + * plus a full pt_regs
 + */
 +static const struct bb_mem_contains error_code_full_pt_regs[] = {
 +      { 0x78, BBRG_UNDEFINED },
 +      { 0x70, BBRG_RDI },
 +      { 0x68, BBRG_RSI },
 +      { 0x60, BBRG_RDX },
 +      { 0x58, BBRG_RCX },
 +      { 0x50, BBRG_RAX },
 +      { 0x48, BBRG_R8  },
 +      { 0x40, BBRG_R9  },
 +      { 0x38, BBRG_R10 },
 +      { 0x30, BBRG_R11 },
 +      { 0x28, BBRG_RBX },
 +      { 0x20, BBRG_RBP },
 +      { 0x18, BBRG_R12 },
 +      { 0x10, BBRG_R13 },
 +      { 0x08, BBRG_R14 },
 +      { 0x00, BBRG_R15 },
 +};
 +static const struct bb_mem_contains partial_pt_regs[] = {
 +      { 0x40, BBRG_RDI },
 +      { 0x38, BBRG_RSI },
 +      { 0x30, BBRG_RDX },
 +      { 0x28, BBRG_RCX },
 +      { 0x20, BBRG_RAX },
 +      { 0x18, BBRG_R8  },
 +      { 0x10, BBRG_R9  },
 +      { 0x08, BBRG_R10 },
 +      { 0x00, BBRG_R11 },
 +};
 +static const struct bb_mem_contains partial_pt_regs_plus_1[] = {
 +      { 0x48, BBRG_RDI },
 +      { 0x40, BBRG_RSI },
 +      { 0x38, BBRG_RDX },
 +      { 0x30, BBRG_RCX },
 +      { 0x28, BBRG_RAX },
 +      { 0x20, BBRG_R8  },
 +      { 0x18, BBRG_R9  },
 +      { 0x10, BBRG_R10 },
 +      { 0x08, BBRG_R11 },
 +};
 +static const struct bb_mem_contains partial_pt_regs_plus_2[] = {
 +      { 0x50, BBRG_RDI },
 +      { 0x48, BBRG_RSI },
 +      { 0x40, BBRG_RDX },
 +      { 0x38, BBRG_RCX },
 +      { 0x30, BBRG_RAX },
 +      { 0x28, BBRG_R8  },
 +      { 0x20, BBRG_R9  },
 +      { 0x18, BBRG_R10 },
 +      { 0x10, BBRG_R11 },
 +};
 +static const struct bb_mem_contains no_memory[] = {
 +};
 +/* Hardware has already pushed an error_code on the stack.  Use undefined just
 + * to set the initial stack offset.
 + */
 +static const struct bb_mem_contains error_code[] = {
 +      { 0x0, BBRG_UNDEFINED },
 +};
 +/* error_code plus original rax */
 +static const struct bb_mem_contains error_code_rax[] = {
 +      { 0x8, BBRG_UNDEFINED },
 +      { 0x0, BBRG_RAX },
 +};
 +
 +static const struct bb_reg_contains all_regs[] = {
 +      [BBRG_RAX] = { BBRG_RAX, 0 },
 +      [BBRG_RBX] = { BBRG_RBX, 0 },
 +      [BBRG_RCX] = { BBRG_RCX, 0 },
 +      [BBRG_RDX] = { BBRG_RDX, 0 },
 +      [BBRG_RDI] = { BBRG_RDI, 0 },
 +      [BBRG_RSI] = { BBRG_RSI, 0 },
 +      [BBRG_RBP] = { BBRG_RBP, 0 },
 +      [BBRG_RSP] = { BBRG_OSP, 0 },
 +      [BBRG_R8 ] = { BBRG_R8,  0 },
 +      [BBRG_R9 ] = { BBRG_R9,  0 },
 +      [BBRG_R10] = { BBRG_R10, 0 },
 +      [BBRG_R11] = { BBRG_R11, 0 },
 +      [BBRG_R12] = { BBRG_R12, 0 },
 +      [BBRG_R13] = { BBRG_R13, 0 },
 +      [BBRG_R14] = { BBRG_R14, 0 },
 +      [BBRG_R15] = { BBRG_R15, 0 },
 +};
 +static const struct bb_reg_contains no_regs[] = {
 +};
 +
 +static struct bb_name_state bb_special_cases[] = {
 +
 +      /* First the cases that pass data only in memory.  We do not check any
 +       * register state for these cases.
 +       */
 +
 +      /* Simple cases, no exceptions */
 +      NS_MEM("ia32_ptregs_common", partial_pt_regs_plus_1, 0),
 +      NS_MEM("ia32_sysret", partial_pt_regs, 0),
 +      NS_MEM("int_careful", partial_pt_regs, 0),
 +      NS_MEM("ia32_badarg", partial_pt_regs, 0),
 +      NS_MEM("int_restore_rest", full_pt_regs, 0),
 +      NS_MEM("int_signal", full_pt_regs, 0),
 +      NS_MEM("int_very_careful", partial_pt_regs, 0),
 +      NS_MEM("ptregscall_common", full_pt_regs_plus_1, 0),
 +      NS_MEM("ret_from_intr", partial_pt_regs_plus_2, 0),
 +      NS_MEM("stub32_clone", partial_pt_regs_plus_1, 0),
 +      NS_MEM("stub32_execve", partial_pt_regs_plus_1, 0),
 +      NS_MEM("stub32_fork", partial_pt_regs_plus_1, 0),
 +      NS_MEM("stub32_iopl", partial_pt_regs_plus_1, 0),
 +      NS_MEM("stub32_rt_sigreturn", partial_pt_regs_plus_1, 0),
 +      NS_MEM("stub32_sigaltstack", partial_pt_regs_plus_1, 0),
 +      NS_MEM("stub32_sigreturn", partial_pt_regs_plus_1, 0),
 +      NS_MEM("stub32_vfork", partial_pt_regs_plus_1, 0),
 +      NS_MEM("stub_clone", partial_pt_regs_plus_1, 0),
 +      NS_MEM("stub_execve", partial_pt_regs_plus_1, 0),
 +      NS_MEM("stub_fork", partial_pt_regs_plus_1, 0),
 +      NS_MEM("stub_iopl", partial_pt_regs_plus_1, 0),
 +      NS_MEM("stub_rt_sigreturn", partial_pt_regs_plus_1, 0),
 +      NS_MEM("stub_sigaltstack", partial_pt_regs_plus_1, 0),
 +      NS_MEM("stub_vfork", partial_pt_regs_plus_1, 0),
 +      NS_MEM("sysenter_auditsys", partial_pt_regs,
 +              BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11)),
 +
 +      NS_MEM("paranoid_exit", error_code_full_pt_regs, 0),
 +
 +      NS_MEM_FROM("ia32_badsys", "ia32_sysenter_target",
 +              partial_pt_regs,
 +              /* ia32_sysenter_target uses CLEAR_RREGS to clear R8-R11 on
 +               * some paths.  It also stomps on RAX.
 +               */
 +              BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
 +              BB_SKIP(RAX)),
 +      NS_MEM_FROM("ia32_badsys", "ia32_cstar_target",
 +              partial_pt_regs,
 +              /* ia32_cstar_target uses CLEAR_RREGS to clear R8-R11 on some
 +               * paths.  It also stomps on RAX.  Even more confusing, instead
 +               * of storing RCX it stores RBP.  WTF?
 +               */
 +              BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
 +              BB_SKIP(RAX) | BB_SKIP(RCX)),
 +      NS_MEM_FROM("ia32_badsys", "ia32_syscall",
 +              partial_pt_regs,
 +              BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11)),
 +      NS_MEM("ia32_badsys", partial_pt_regs, 0),
 +
 +#ifdef CONFIG_AUDITSYSCALL
 +      NS_MEM_FROM("int_with_check", "sysexit_audit", partial_pt_regs,
 +              BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
 +              BB_SKIP(RAX)),
 +      NS_MEM_FROM("int_with_check", "ia32_cstar_target", partial_pt_regs,
 +              BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
 +              BB_SKIP(RAX) | BB_SKIP(RCX)),
 +#endif
 +      NS_MEM("int_with_check", no_memory, 0),
 +
 +      /* Various bits of code branch to int_ret_from_sys_call, with slightly
 +       * different missing values in pt_regs.
 +       */
 +      NS_MEM_FROM("int_ret_from_sys_call", "ret_from_fork",
 +              partial_pt_regs,
 +              BB_SKIP(R11)),
 +      NS_MEM_FROM("int_ret_from_sys_call", "stub_execve",
 +              partial_pt_regs,
 +              BB_SKIP(RAX) | BB_SKIP(RCX)),
 +      NS_MEM_FROM("int_ret_from_sys_call", "stub_rt_sigreturn",
 +              partial_pt_regs,
 +              BB_SKIP(RAX) | BB_SKIP(RCX)),
 +      NS_MEM_FROM("int_ret_from_sys_call", "kernel_execve",
 +              partial_pt_regs,
 +              BB_SKIP(RAX)),
 +      NS_MEM_FROM("int_ret_from_sys_call", "ia32_syscall",
 +              partial_pt_regs,
 +              /* ia32_syscall only saves RDI through RCX. */
 +              BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
 +              BB_SKIP(RAX)),
 +      NS_MEM_FROM("int_ret_from_sys_call", "ia32_sysenter_target",
 +              partial_pt_regs,
 +              /* ia32_sysenter_target uses CLEAR_RREGS to clear R8-R11 on
 +              * some paths.  It also stomps on RAX.
 +              */
 +              BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
 +              BB_SKIP(RAX)),
 +      NS_MEM_FROM("int_ret_from_sys_call", "ia32_cstar_target",
 +              partial_pt_regs,
 +              /* ia32_cstar_target uses CLEAR_RREGS to clear R8-R11 on some
 +               * paths.  It also stomps on RAX.  Even more confusing, instead
 +               * of storing RCX it stores RBP.  WTF?
 +               */
 +              BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
 +              BB_SKIP(RAX) | BB_SKIP(RCX)),
 +      NS_MEM_FROM("int_ret_from_sys_call", "ia32_badsys",
 +              partial_pt_regs, BB_SKIP(RAX)),
 +      NS_MEM("int_ret_from_sys_call", partial_pt_regs, 0),
 +
 +#ifdef        CONFIG_PREEMPT
 +      NS_MEM("retint_kernel", partial_pt_regs, BB_SKIP(RAX)),
 +#endif        /* CONFIG_PREEMPT */
 +
 +      NS_MEM("retint_careful", partial_pt_regs, BB_SKIP(RAX)),
 +
 +      /* Horrible hack: For a brand new x86_64 task, switch_to() branches to
 +       * ret_from_fork with a totally different stack state from all the
 +       * other tasks that come out of switch_to().  This non-standard state
 +       * cannot be represented so just ignore the branch from switch_to() to
 +       * ret_from_fork.  Due to inlining and linker labels, switch_to() can
 +       * appear as several different function labels, including schedule,
 +       * context_switch and __sched_text_start.
 +       */
 +      NS_MEM_FROM("ret_from_fork", "schedule", no_memory, 0),
 +      NS_MEM_FROM("ret_from_fork", "__schedule", no_memory, 0),
 +      NS_MEM_FROM("ret_from_fork", "__sched_text_start", no_memory, 0),
 +      NS_MEM_FROM("ret_from_fork", "context_switch", no_memory, 0),
 +      NS_MEM("ret_from_fork", full_pt_regs, 0),
 +
 +      NS_MEM_FROM("ret_from_sys_call", "ret_from_fork",
 +              partial_pt_regs,
 +              BB_SKIP(R11)),
 +      NS_MEM("ret_from_sys_call", partial_pt_regs, 0),
 +
 +      NS_MEM("retint_restore_args",
 +              partial_pt_regs,
 +              BB_SKIP(RAX) | BB_SKIP(RCX)),
 +
 +      NS_MEM("retint_swapgs",
 +              partial_pt_regs,
 +              BB_SKIP(RAX) | BB_SKIP(RCX)),
 +
 +      /* Now the cases that pass data in registers.  We do not check any
 +       * memory state for these cases.
 +       */
 +
 +      NS_REG("bad_put_user",
 +              all_regs, BB_SKIP(RBX)),
 +
 +      NS_REG("bad_get_user",
 +              all_regs, BB_SKIP(RAX) | BB_SKIP(RDX)),
 +
 +      NS_REG("bad_to_user",
 +              all_regs,
 +              BB_SKIP(RAX) | BB_SKIP(RCX)),
 +
 +      NS_REG("ia32_ptregs_common",
 +              all_regs,
 +              0),
 +
 +      NS_REG("copy_user_generic_unrolled",
 +              all_regs,
 +              BB_SKIP(RAX) | BB_SKIP(RCX)),
 +
 +      NS_REG("copy_user_generic_string",
 +              all_regs,
 +              BB_SKIP(RAX) | BB_SKIP(RCX)),
 +
 +      NS_REG("irq_return",
 +              all_regs,
 +              0),
 +
 +      /* Finally the cases that pass data in both registers and memory.
 +       */
 +
 +      NS("invalid_TSS", error_code, all_regs, 0, 0, 0),
 +      NS("segment_not_present", error_code, all_regs, 0, 0, 0),
 +      NS("alignment_check", error_code, all_regs, 0, 0, 0),
 +      NS("page_fault", error_code, all_regs, 0, 0, 0),
 +      NS("general_protection", error_code, all_regs, 0, 0, 0),
 +      NS("error_entry", error_code_rax, all_regs, 0, BB_SKIP(RAX), -0x10),
 +      NS("error_exit", error_code_full_pt_regs, no_regs, 0, 0, 0x30),
 +      NS("common_interrupt", error_code, all_regs, 0, 0, -0x8),
 +      NS("save_args", error_code, all_regs, 0, 0, -0x50),
 +      NS("int3", no_memory, all_regs, 0, 0, -0x80),
 +};
 +
 +static const char *bb_spurious[] = {
 +                              /* schedule */
 +      "thread_return",
 +                              /* system_call */
 +      "system_call_after_swapgs",
 +      "system_call_fastpath",
 +      "ret_from_sys_call",
 +      "sysret_check",
 +      "sysret_careful",
 +      "sysret_signal",
 +      "badsys",
 +#ifdef CONFIG_AUDITSYSCALL
 +      "auditsys",
 +      "sysret_audit",
 +#endif
 +      "tracesys",
 +      "int_ret_from_sys_call",
 +      "int_with_check",
 +      "int_careful",
 +      "int_very_careful",
 +      "int_signal",
 +      "int_restore_rest",
 +                              /* common_interrupt */
 +      "ret_from_intr",
 +      "exit_intr",
 +      "retint_with_reschedule",
 +      "retint_check",
 +      "retint_swapgs",
 +      "retint_restore_args",
 +      "restore_args",
 +      "irq_return",
 +      "bad_iret",
 +      "retint_careful",
 +      "retint_signal",
 +#ifdef        CONFIG_PREEMPT
 +      "retint_kernel",
 +#endif        /* CONFIG_PREEMPT */
 +                              /* paranoid_exit */
 +      "paranoid_swapgs",
 +      "paranoid_restore",
 +      "paranoid_userspace",
 +      "paranoid_schedule",
 +                              /* error_entry */
 +      "error_swapgs",
 +      "error_sti",
 +      "error_kernelspace",
 +                              /* nmi */
 +#ifdef CONFIG_TRACE_IRQFLAGS
 +      "nmi_swapgs",
 +      "nmi_restore",
 +      "nmi_userspace",
 +      "nmi_schedule",
 +#endif
 +                              /* load_gs_index */
 +      "gs_change",
 +      "bad_gs",
 +                              /* ia32_sysenter_target */
 +      "sysenter_do_call",
 +      "sysenter_dispatch",
 +      "sysexit_from_sys_call",
 +#ifdef CONFIG_AUDITSYSCALL
 +      "sysenter_auditsys",
 +      "sysexit_audit",
 +#endif
 +      "sysenter_tracesys",
 +                              /* ia32_cstar_target */
 +      "cstar_do_call",
 +      "cstar_dispatch",
 +      "sysretl_from_sys_call",
 +#ifdef CONFIG_AUDITSYSCALL
 +      "cstar_auditsys",
 +      "sysretl_audit",
 +#endif
 +      "cstar_tracesys",
 +                              /* ia32_syscall */
 +      "ia32_do_call",
 +      "ia32_sysret",
 +      "ia32_tracesys",
 +#ifdef        CONFIG_HIBERNATION
 +                              /* restore_image */
 +      "loop",
 +      "done",
 +#endif        /* CONFIG_HIBERNATION */
 +#ifdef        CONFIG_KPROBES
 +                              /* jprobe_return */
 +      "jprobe_return_end",
 +                              /* kretprobe_trampoline_holder */
 +      "kretprobe_trampoline",
 +#endif        /* CONFIG_KPROBES */
 +#ifdef        CONFIG_KEXEC
 +                              /* relocate_kernel */
 +      "relocate_new_kernel",
 +#endif        /* CONFIG_KEXEC */
- #ifdef        CONFIG_PARAVIRT_XEN
++#ifdef        CONFIG_XEN
 +                              /* arch/i386/xen/xen-asm.S */
 +      "xen_irq_enable_direct_end",
 +      "xen_irq_disable_direct_end",
 +      "xen_save_fl_direct_end",
 +      "xen_restore_fl_direct_end",
 +      "xen_iret_start_crit",
 +      "iret_restore_end",
 +      "xen_iret_end_crit",
 +      "hyper_iret",
 +#endif        /* CONFIG_XEN */
 +};
 +
 +static const char *bb_hardware_handlers[] = {
 +      "system_call",
 +      "common_interrupt",
 +      "error_entry",
 +      "debug",
 +      "nmi",
 +      "int3",
 +      "double_fault",
 +      "stack_segment",
 +      "machine_check",
 +      "kdb_call",
 +};
 +
 +static int
 +bb_hardware_pushed_arch(kdb_machreg_t rsp,
 +                      const struct kdb_activation_record *ar)
 +{
 +      /* x86_64 interrupt stacks are 16 byte aligned and you must get the
 +       * next rsp from stack, it cannot be statically calculated.  Do not
 +       * include the word at rsp, it is pushed by hardware but is treated as
 +       * a normal software return value.
 +       *
 +       * When an IST switch occurs (e.g. NMI) then the saved rsp points to
 +       * another stack entirely.  Assume that the IST stack is 16 byte
 +       * aligned and just return the size of the hardware data on this stack.
 +       * The stack unwind code will take care of the stack switch.
 +       */
 +      kdb_machreg_t saved_rsp = *((kdb_machreg_t *)rsp + 3);
 +      int hardware_pushed = saved_rsp - rsp - KDB_WORD_SIZE;
 +      if (hardware_pushed < 4 * KDB_WORD_SIZE ||
 +          saved_rsp < ar->stack.logical_start ||
 +          saved_rsp >= ar->stack.logical_end)
 +              return 4 * KDB_WORD_SIZE;
 +      else
 +              return hardware_pushed;
 +}
 +
 +static void
 +bb_start_block0(void)
 +{
 +      bb_reg_code_set_value(BBRG_RAX, BBRG_RAX);
 +      bb_reg_code_set_value(BBRG_RBX, BBRG_RBX);
 +      bb_reg_code_set_value(BBRG_RCX, BBRG_RCX);
 +      bb_reg_code_set_value(BBRG_RDX, BBRG_RDX);
 +      bb_reg_code_set_value(BBRG_RDI, BBRG_RDI);
 +      bb_reg_code_set_value(BBRG_RSI, BBRG_RSI);
 +      bb_reg_code_set_value(BBRG_RBP, BBRG_RBP);
 +      bb_reg_code_set_value(BBRG_RSP, BBRG_OSP);
 +      bb_reg_code_set_value(BBRG_R8, BBRG_R8);
 +      bb_reg_code_set_value(BBRG_R9, BBRG_R9);
 +      bb_reg_code_set_value(BBRG_R10, BBRG_R10);
 +      bb_reg_code_set_value(BBRG_R11, BBRG_R11);
 +      bb_reg_code_set_value(BBRG_R12, BBRG_R12);
 +      bb_reg_code_set_value(BBRG_R13, BBRG_R13);
 +      bb_reg_code_set_value(BBRG_R14, BBRG_R14);
 +      bb_reg_code_set_value(BBRG_R15, BBRG_R15);
 +}
 +
 +/* x86_64 does not have a special case for __switch_to */
 +
 +static void
 +bb_fixup_switch_to(char *p)
 +{
 +}
 +
 +static int
 +bb_asmlinkage_arch(void)
 +{
 +      return strncmp(bb_func_name, "__down", 6) == 0 ||
 +             strncmp(bb_func_name, "__up", 4) == 0 ||
 +             strncmp(bb_func_name, "stub_", 5) == 0 ||
 +             strcmp(bb_func_name, "ret_from_fork") == 0 ||
 +             strcmp(bb_func_name, "ptregscall_common") == 0;
 +}
 +
 +#else /* !CONFIG_X86_64 */
 +
 +/* Registers that can be used to pass parameters, in the order that parameters
 + * are passed.
 + */
 +
 +const static enum bb_reg_code
 +bb_param_reg[] = {
 +      BBRG_RAX,
 +      BBRG_RDX,
 +      BBRG_RCX,
 +};
 +
 +const static enum bb_reg_code
 +bb_preserved_reg[] = {
 +      BBRG_RBX,
 +      BBRG_RBP,
 +      BBRG_RSP,
 +      BBRG_RSI,
 +      BBRG_RDI,
 +};
 +
 +static const struct bb_mem_contains full_pt_regs[] = {
 +      { 0x18, BBRG_RAX },
 +      { 0x14, BBRG_RBP },
 +      { 0x10, BBRG_RDI },
 +      { 0x0c, BBRG_RSI },
 +      { 0x08, BBRG_RDX },
 +      { 0x04, BBRG_RCX },
 +      { 0x00, BBRG_RBX },
 +};
 +static const struct bb_mem_contains no_memory[] = {
 +};
 +/* Hardware has already pushed an error_code on the stack.  Use undefined just
 + * to set the initial stack offset.
 + */
 +static const struct bb_mem_contains error_code[] = {
 +      { 0x0, BBRG_UNDEFINED },
 +};
 +/* rbx already pushed */
 +static const struct bb_mem_contains rbx_pushed[] = {
 +      { 0x0, BBRG_RBX },
 +};
 +#ifdef        CONFIG_MATH_EMULATION
 +static const struct bb_mem_contains mem_fpu_reg_round[] = {
 +      { 0xc, BBRG_RBP },
 +      { 0x8, BBRG_RSI },
 +      { 0x4, BBRG_RDI },
 +      { 0x0, BBRG_RBX },
 +};
 +#endif        /* CONFIG_MATH_EMULATION */
 +
 +static const struct bb_reg_contains all_regs[] = {
 +      [BBRG_RAX] = { BBRG_RAX, 0 },
 +      [BBRG_RBX] = { BBRG_RBX, 0 },
 +      [BBRG_RCX] = { BBRG_RCX, 0 },
 +      [BBRG_RDX] = { BBRG_RDX, 0 },
 +      [BBRG_RDI] = { BBRG_RDI, 0 },
 +      [BBRG_RSI] = { BBRG_RSI, 0 },
 +      [BBRG_RBP] = { BBRG_RBP, 0 },
 +      [BBRG_RSP] = { BBRG_OSP, 0 },
 +};
 +static const struct bb_reg_contains no_regs[] = {
 +};
 +#ifdef        CONFIG_MATH_EMULATION
 +static const struct bb_reg_contains reg_fpu_reg_round[] = {
 +      [BBRG_RBP] = { BBRG_OSP, -0x4 },
 +      [BBRG_RSP] = { BBRG_OSP, -0x10 },
 +};
 +#endif        /* CONFIG_MATH_EMULATION */
 +
 +static struct bb_name_state bb_special_cases[] = {
 +
 +      /* First the cases that pass data only in memory.  We do not check any
 +       * register state for these cases.
 +       */
 +
 +      /* Simple cases, no exceptions */
 +      NS_MEM("check_userspace", full_pt_regs, 0),
 +      NS_MEM("device_not_available_emulate", full_pt_regs, 0),
 +      NS_MEM("ldt_ss", full_pt_regs, 0),
 +      NS_MEM("no_singlestep", full_pt_regs, 0),
 +      NS_MEM("restore_all", full_pt_regs, 0),
 +      NS_MEM("restore_nocheck", full_pt_regs, 0),
 +      NS_MEM("restore_nocheck_notrace", full_pt_regs, 0),
 +      NS_MEM("ret_from_exception", full_pt_regs, 0),
 +      NS_MEM("ret_from_fork", full_pt_regs, 0),
 +      NS_MEM("ret_from_intr", full_pt_regs, 0),
 +      NS_MEM("work_notifysig", full_pt_regs, 0),
 +      NS_MEM("work_pending", full_pt_regs, 0),
 +
 +#ifdef        CONFIG_PREEMPT
 +      NS_MEM("resume_kernel", full_pt_regs, 0),
 +#endif        /* CONFIG_PREEMPT */
 +
 +      NS_MEM("common_interrupt", error_code, 0),
 +      NS_MEM("error_code", error_code, 0),
 +
 +      NS_MEM("bad_put_user", rbx_pushed, 0),
 +
 +      NS_MEM_FROM("resume_userspace", "syscall_badsys",
 +              full_pt_regs, BB_SKIP(RAX)),
 +      NS_MEM_FROM("resume_userspace", "syscall_fault",
 +              full_pt_regs, BB_SKIP(RAX)),
 +      NS_MEM_FROM("resume_userspace", "syscall_trace_entry",
 +              full_pt_regs, BB_SKIP(RAX)),
 +      /* Too difficult to trace through the various vm86 functions for now.
 +       * They are C functions that start off with some memory state, fiddle
 +       * the registers then jmp directly to resume_userspace.  For the
 +       * moment, just assume that they are valid and do no checks.
 +       */
 +      NS_FROM("resume_userspace", "do_int",
 +              no_memory, no_regs, 0, 0, 0),
 +      NS_FROM("resume_userspace", "do_sys_vm86",
 +              no_memory, no_regs, 0, 0, 0),
 +      NS_FROM("resume_userspace", "handle_vm86_fault",
 +              no_memory, no_regs, 0, 0, 0),
 +      NS_FROM("resume_userspace", "handle_vm86_trap",
 +              no_memory, no_regs, 0, 0, 0),
 +      NS_MEM("resume_userspace", full_pt_regs, 0),
 +
 +      NS_MEM_FROM("syscall_badsys", "ia32_sysenter_target",
 +              full_pt_regs, BB_SKIP(RBP)),
 +      NS_MEM("syscall_badsys", full_pt_regs, 0),
 +
 +      NS_MEM_FROM("syscall_call", "syscall_trace_entry",
 +              full_pt_regs, BB_SKIP(RAX)),
 +      NS_MEM("syscall_call", full_pt_regs, 0),
 +
 +      NS_MEM_FROM("syscall_exit", "syscall_trace_entry",
 +              full_pt_regs, BB_SKIP(RAX)),
 +      NS_MEM("syscall_exit", full_pt_regs, 0),
 +
 +      NS_MEM_FROM("syscall_exit_work", "ia32_sysenter_target",
 +              full_pt_regs, BB_SKIP(RAX) | BB_SKIP(RBP)),
 +      NS_MEM_FROM("syscall_exit_work", "system_call",
 +              full_pt_regs, BB_SKIP(RAX)),
 +      NS_MEM("syscall_exit_work", full_pt_regs, 0),
 +
 +      NS_MEM_FROM("syscall_trace_entry", "ia32_sysenter_target",
 +              full_pt_regs, BB_SKIP(RBP)),
 +      NS_MEM_FROM("syscall_trace_entry", "system_call",
 +              full_pt_regs, BB_SKIP(RAX)),
 +      NS_MEM("syscall_trace_entry", full_pt_regs, 0),
 +
 +      /* Now the cases that pass data in registers.  We do not check any
 +       * memory state for these cases.
 +       */
 +
 +      NS_REG("syscall_fault", all_regs, 0),
 +
 +      NS_REG("bad_get_user", all_regs,
 +              BB_SKIP(RAX) | BB_SKIP(RDX)),
 +
 +      /* Finally the cases that pass data in both registers and memory.
 +      */
 +
 +      /* This entry is redundant now because bb_fixup_switch_to() hides the
 +       * jmp __switch_to case, however the entry is left here as
 +       * documentation.
 +       *
 +       * NS("__switch_to", no_memory, no_regs, 0, 0, 0),
 +       */
 +
 +      NS("iret_exc", no_memory, all_regs, 0, 0, 0x20),
 +
 +#ifdef        CONFIG_MATH_EMULATION
 +      NS("fpu_reg_round", mem_fpu_reg_round, reg_fpu_reg_round, 0, 0, 0),
 +#endif        /* CONFIG_MATH_EMULATION */
 +};
 +
 +static const char *bb_spurious[] = {
 +                              /* ret_from_exception */
 +      "ret_from_intr",
 +      "check_userspace",
 +      "resume_userspace",
 +                              /* resume_kernel */
 +#ifdef        CONFIG_PREEMPT
 +      "need_resched",
 +#endif        /* CONFIG_PREEMPT */
 +                              /* ia32_sysenter_target */
 +      "sysenter_past_esp",
 +                              /* system_call */
 +      "no_singlestep",
 +      "syscall_call",
 +      "syscall_exit",
 +      "restore_all",
 +      "restore_nocheck",
 +      "restore_nocheck_notrace",
 +      "ldt_ss",
 +      /* do not include iret_exc, it is in a .fixup section */
 +                              /* work_pending */
 +      "work_resched",
 +      "work_notifysig",
 +#ifdef        CONFIG_VM86
 +      "work_notifysig_v86",
 +#endif        /* CONFIG_VM86 */
 +                              /* page_fault */
 +      "error_code",
 +                              /* device_not_available */
 +      "device_not_available_emulate",
 +                              /* debug */
 +      "debug_esp_fix_insn",
 +      "debug_stack_correct",
 +                              /* nmi */
 +      "nmi_stack_correct",
 +      "nmi_stack_fixup",
 +      "nmi_debug_stack_check",
 +      "nmi_espfix_stack",
 +#ifdef        CONFIG_HIBERNATION
 +                              /* restore_image */
 +      "copy_loop",
 +      "done",
 +#endif        /* CONFIG_HIBERNATION */
 +#ifdef        CONFIG_KPROBES
 +                              /* jprobe_return */
 +      "jprobe_return_end",
 +#endif        /* CONFIG_KPROBES */
 +#ifdef        CONFIG_KEXEC
 +                              /* relocate_kernel */
 +      "relocate_new_kernel",
 +#endif        /* CONFIG_KEXEC */
 +#ifdef        CONFIG_MATH_EMULATION
 +                              /* assorted *.S files in arch/i386/math_emu */
 +      "Denorm_done",
 +      "Denorm_shift_more_than_32",
 +      "Denorm_shift_more_than_63",
 +      "Denorm_shift_more_than_64",
 +      "Do_unmasked_underflow",
 +      "Exp_not_underflow",
 +      "fpu_Arith_exit",
 +      "fpu_reg_round",
 +      "fpu_reg_round_signed_special_exit",
 +      "fpu_reg_round_special_exit",
 +      "L_accum_done",
 +      "L_accum_loaded",
 +      "L_accum_loop",
 +      "L_arg1_larger",
 +      "L_bugged",
 +      "L_bugged_1",
 +      "L_bugged_2",
 +      "L_bugged_3",
 +      "L_bugged_4",
 +      "L_bugged_denorm_486",
 +      "L_bugged_round24",
 +      "L_bugged_round53",
 +      "L_bugged_round64",
 +      "LCheck_24_round_up",
 +      "LCheck_53_round_up",
 +      "LCheck_Round_Overflow",
 +      "LCheck_truncate_24",
 +      "LCheck_truncate_53",
 +      "LCheck_truncate_64",
 +      "LDenormal_adj_exponent",
 +      "L_deNormalised",
 +      "LDo_24_round_up",
 +      "LDo_2nd_32_bits",
 +      "LDo_2nd_div",
 +      "LDo_3rd_32_bits",
 +      "LDo_3rd_div",
 +      "LDo_53_round_up",
 +      "LDo_64_round_up",
 +      "L_done",
 +      "LDo_truncate_24",
 +      "LDown_24",
 +      "LDown_53",
 +      "LDown_64",
 +      "L_entry_bugged",
 +      "L_error_exit",
 +      "L_exactly_32",
 +      "L_exception_exit",
 +      "L_exit",
 +      "L_exit_nuo_valid",
 +      "L_exit_nuo_zero",
 +      "L_exit_valid",
 +      "L_extent_zero",
 +      "LFirst_div_done",
 +      "LFirst_div_not_1",
 +      "L_Full_Division",
 +      "LGreater_Half_24",
 +      "LGreater_Half_53",
 +      "LGreater_than_1",
 +      "LLess_than_1",
 +      "L_Make_denorm",
 +      "L_more_31_no_low",
 +      "L_more_63_no_low",
 +      "L_more_than_31",
 +      "L_more_than_63",
 +      "L_more_than_64",
 +      "L_more_than_65",
 +      "L_more_than_95",
 +      "L_must_be_zero",
 +      "L_n_exit",
 +      "L_no_adjust",
 +      "L_no_bit_lost",
 +      "L_no_overflow",
 +      "L_no_precision_loss",
 +      "L_Normalised",
 +      "L_norm_bugged",
 +      "L_n_shift_1",
 +      "L_nuo_shift_1",
 +      "L_overflow",
 +      "L_precision_lost_down",
 +      "L_precision_lost_up",
 +      "LPrevent_2nd_overflow",
 +      "LPrevent_3rd_overflow",
 +      "LPseudoDenormal",
 +      "L_Re_normalise",
 +      "LResult_Normalised",
 +      "L_round",
 +      "LRound_large",
 +      "LRound_nearest_24",
 +      "LRound_nearest_53",
 +      "LRound_nearest_64",
 +      "LRound_not_small",
 +      "LRound_ovfl",
 +      "LRound_precision",
 +      "LRound_prep",
 +      "L_round_the_result",
 +      "LRound_To_24",
 +      "LRound_To_53",
 +      "LRound_To_64",
 +      "LSecond_div_done",
 +      "LSecond_div_not_1",
 +      "L_shift_1",
 +      "L_shift_32",
 +      "L_shift_65_nc",
 +      "L_shift_done",
 +      "Ls_less_than_32",
 +      "Ls_more_than_63",
 +      "Ls_more_than_95",
 +      "L_Store_significand",
 +      "L_subtr",
 +      "LTest_over",
 +      "LTruncate_53",
 +      "LTruncate_64",
 +      "L_underflow",
 +      "L_underflow_to_zero",
 +      "LUp_24",
 +      "LUp_53",
 +      "LUp_64",
 +      "L_zero",
 +      "Normalise_result",
 +      "Signal_underflow",
 +      "sqrt_arg_ge_2",
 +      "sqrt_get_more_precision",
 +      "sqrt_more_prec_large",
 +      "sqrt_more_prec_ok",
 +      "sqrt_more_prec_small",
 +      "sqrt_near_exact",
 +      "sqrt_near_exact_large",
 +      "sqrt_near_exact_ok",
 +      "sqrt_near_exact_small",
 +      "sqrt_near_exact_x",
 +      "sqrt_prelim_no_adjust",
 +      "sqrt_round_result",
 +      "sqrt_stage_2_done",
 +      "sqrt_stage_2_error",
 +      "sqrt_stage_2_finish",
 +      "sqrt_stage_2_positive",
 +      "sqrt_stage_3_error",
 +      "sqrt_stage_3_finished",
 +      "sqrt_stage_3_no_error",
 +      "sqrt_stage_3_positive",
 +      "Unmasked_underflow",
 +      "xExp_not_underflow",
 +#endif        /* CONFIG_MATH_EMULATION */
 +};
 +
 +static const char *bb_hardware_handlers[] = {
 +      "ret_from_exception",
 +      "system_call",
 +      "work_pending",
 +      "syscall_fault",
 +      "page_fault",
 +      "coprocessor_error",
 +      "simd_coprocessor_error",
 +      "device_not_available",
 +      "debug",
 +      "nmi",
 +      "int3",
 +      "overflow",
 +      "bounds",
 +      "invalid_op",
 +      "coprocessor_segment_overrun",
 +      "invalid_TSS",
 +      "segment_not_present",
 +      "stack_segment",
 +      "general_protection",
 +      "alignment_check",
 +      "kdb_call",
 +      "divide_error",
 +      "machine_check",
 +      "spurious_interrupt_bug",
 +};
 +
 +static int
 +bb_hardware_pushed_arch(kdb_machreg_t rsp,
 +                      const struct kdb_activation_record *ar)
 +{
 +      return (2 * KDB_WORD_SIZE);
 +}
 +
 +static void
 +bb_start_block0(void)
 +{
 +      bb_reg_code_set_value(BBRG_RAX, BBRG_RAX);
 +      bb_reg_code_set_value(BBRG_RBX, BBRG_RBX);
 +      bb_reg_code_set_value(BBRG_RCX, BBRG_RCX);
 +      bb_reg_code_set_value(BBRG_RDX, BBRG_RDX);
 +      bb_reg_code_set_value(BBRG_RDI, BBRG_RDI);
 +      bb_reg_code_set_value(BBRG_RSI, BBRG_RSI);
 +      bb_reg_code_set_value(BBRG_RBP, BBRG_RBP);
 +      bb_reg_code_set_value(BBRG_RSP, BBRG_OSP);
 +}
 +
 +/* The i386 code that switches stack in a context switch is an extremely
 + * special case.  It saves the rip pointing to a label that is not otherwise
 + * referenced, saves the current rsp then pushes a word.  The magic code that
 + * resumes the new task picks up the saved rip and rsp, effectively referencing
 + * a label that otherwise is not used and ignoring the pushed word.
 + *
 + * The simplest way to handle this very strange case is to recognise jmp
 + * address <__switch_to> and treat it as a popfl instruction.  This avoids
 + * terminating the block on this jmp and removes one word from the stack state,
 + * which is the end effect of all the magic code.
 + *
 + * Called with the instruction line, starting after the first ':'.
 + */
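 +
 +/* For illustration only (the address is made up and the exact text depends on
 + * the disassembler): a buffered line whose text after the leading "address:"
 + * reads
 + *    jmp    0xc0102360 <__switch_to>
 + * is rewritten in place to
 + *    popfl
 + * so pass 1 does not end the block on this jmp and pass 2 simply drops one
 + * word from the tracked stack state.
 + */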
 +
 +static void
 +bb_fixup_switch_to(char *p)
 +{
 +      char *p1 = p;
 +      p += strspn(p, " \t");          /* start of instruction */
 +      if (strncmp(p, "jmp", 3))
 +              return;
 +      p += strcspn(p, " \t");         /* end of instruction */
 +      p += strspn(p, " \t");          /* start of address */
 +      p += strcspn(p, " \t");         /* end of address */
 +      p += strspn(p, " \t");          /* start of comment */
 +      if (strcmp(p, "<__switch_to>") == 0)
 +              strcpy(p1, "popfl");
 +}
 +
 +static int
 +bb_asmlinkage_arch(void)
 +{
 +      return strcmp(bb_func_name, "ret_from_exception") == 0 ||
 +             strcmp(bb_func_name, "syscall_trace_entry") == 0;
 +}
 +
 +#endif        /* CONFIG_X86_64 */
 +
 +
 +/*============================================================================*/
 +/*                                                                            */
 +/* Common code and data.                                                      */
 +/*                                                                            */
 +/*============================================================================*/
 +
 +
 +/* Tracking registers by decoding the instructions is quite a bit harder than
 + * doing the same tracking using compiler generated information.  Register
 + * contents can remain in the same register, they can be copied to other
 + * registers, they can be stored on stack or they can be modified/overwritten.
 + * At any one time, there are 0 or more copies of the original value that was
 + * supplied in each register on input to the current function.  If a register
 + * exists in multiple places, one copy of that register is the master version,
 + * the others are temporary copies which may or may not be destroyed before the
 + * end of the function.
 + *
 + * The compiler knows which copy of a register is the master and which are
 + * temporary copies, which makes it relatively easy to track register contents
 + * as they are saved and restored.  Without that compiler based knowledge, this
 + * code has to track _every_ possible copy of each register, simply because we
 + * do not know which is the master copy and which are temporary copies which
 + * may be destroyed later.
 + *
 + * It gets worse: registers that contain parameters can be copied to other
 + * registers which are then saved on stack in a lower level function.  Also the
 + * stack pointer may be held in multiple registers (typically RSP and RBP)
 + * which contain different offsets from the base of the stack on entry to this
 + * function.  All of which means that we have to track _all_ register
 + * movements, or at least as much as possible.
 + *
 + * Start with the basic block that contains the start of the function, by
 + * definition all registers contain their initial value.  Track each
 + * instruction's effect on register contents, this includes reading from a
 + * parameter register before any write to that register, IOW the register
 + * really does contain a parameter.  The register state is represented by a
 + * dynamically sized array with each entry containing :-
 + *
 + *   Register name
 + *   Location it is copied to (another register or stack + offset)
 + *
 + * Besides the register tracking array, we track which parameter registers are
 + * read before being written, to determine how many parameters are passed in
 + * registers.  We also track which registers contain stack pointers, including
 + * their offset from the original stack pointer on entry to the function.
 + *
 + * At each exit from the current basic block (via JMP instruction or drop
 + * through), the register state is cloned to form the state on input to the
 + * target basic block and the target is marked for processing using this state.
 + * When there are multiple ways to enter a basic block (e.g. several JMP
 + * instructions referencing the same target) then there will be multiple sets
 + * of register state to form the "input" for that basic block; there is no
 + * guarantee that all paths to that block will have the same register state.
 + *
 + * As each target block is processed, all the known sets of register state are
 + * merged to form a suitable subset of the state which agrees with all the
 + * inputs.  The most common case is where one path to this block copies a
 + * register to another register but another path does not; therefore the copy
 + * is only a temporary and should not be propagated into this block.
 + *
 + * If the target block already has an input state from the current transfer
 + * point and the new input state is identical to the previous input state then
 + * we have reached a steady state for the arc from the current location to the
 + * target block.  Therefore there is no need to process the target block again.
 + *
 + * The steps of "process a block, create state for target block(s), pick a new
 + * target block, merge state for target block, process target block" will
 + * continue until all the state changes have propagated all the way down the
 + * basic block tree, including round any cycles in the tree.  The merge step
 + * only deletes tracking entries from the input state(s); it never adds a
 + * tracking entry.  Therefore the overall algorithm is guaranteed to converge
 + * to a steady state; the worst possible case is that every tracking entry into
 + * a block is deleted, which will result in an empty output state.
 + *
 + * As each instruction is decoded, it is checked to see if this is the point at
 + * which execution left this function.  This can be a call to another function
 + * (actually the return address to this function) or the instruction which
 + * was about to be executed when an interrupt occurred (including an oops).
 + * Save the register state at this point.
 + *
 + * We always know what the registers contain when execution left this function.
 + * For an interrupt, the registers are in struct pt_regs.  For a call to
 + * another function, we have already deduced the register state on entry to the
 + * other function by unwinding to the start of that function.  Given the
 + * register state on exit from this function plus the known register contents
 + * on entry to the next function, we can determine the stack pointer value on
 + * input to this function.  That in turn lets us calculate the address of input
 + * registers that have been stored on stack, giving us the input parameters.
 + * Finally the stack pointer gives us the return address which is the exit
 + * point from the calling function, repeat the unwind process on that function.
 + *
 + * The data that tracks which registers contain input parameters is function
 + * global, not local to any basic block.  To determine which input registers
 + * contain parameters, we have to decode the entire function.  Otherwise an
 + * exit early in the function might not have read any parameters yet.
 + */
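 +
 +/* A rough worked example (illustrative only; x86_64 syntax, offsets made up).
 + * On entry to a block every register contains itself and rsp contains the
 + * original stack pointer (osp).  After
 + *    push %rbp
 + *    mov  %rsp,%rbp
 + *    mov  %rdi,%rbx
 + * the tracker records that *(osp-0x8) holds the input value of rbp, that rsp
 + * and rbp both hold osp-0x8, and that rbx holds a copy of the input value of
 + * rdi.  Because rdi was read before being written it counts as a parameter
 + * register.
 + */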
 +
 +/* Record memory contents in terms of the values that were passed to this
 + * function, IOW track which memory locations contain an input value.  A memory
 + * location's contents can be undefined, it can contain an input register value
 + * or it can contain an offset from the original stack pointer.
 + *
 + * This structure is used to record register contents that have been stored in
 + * memory.  Location (BBRG_OSP + 'offset_address') contains the input value
 + * from register 'value'.  When 'value' is BBRG_OSP then offset_value contains
 + * the offset from the original stack pointer that was stored in this memory
 + * location.  When 'value' is not BBRG_OSP then the memory location contains
 + * the original contents of an input register and offset_value is ignored.
 + *
 + * An input register 'value' can be stored in more than one register and/or in
 + * more than one memory location.
 + */
 +
 +struct bb_memory_contains
 +{
 +      short offset_address;
 +      enum bb_reg_code value: 8;
 +      short offset_value;
 +};
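 +
 +/* Illustrative only (x86_64 word size, offsets made up): after "push %rbx",
 + * with rsp currently holding osp-0x10 and rbx still holding its input value,
 + * the store would be recorded roughly as
 + *    { .offset_address = -0x18, .value = BBRG_RBX, .offset_value = 0 }
 + * while a slot at osp-0x28 that holds the value osp-0x20 would be recorded as
 + *    { .offset_address = -0x28, .value = BBRG_OSP, .offset_value = -0x20 }
 + */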
 +
 +/* Track the register state in each basic block. */
 +
 +struct bb_reg_state
 +{
 +      /* Indexed by register value 'reg - BBRG_RAX' */
 +      struct bb_reg_contains contains[KDB_INT_REGISTERS];
 +      int ref_count;
 +      int mem_count;
 +      /* dynamic size for memory locations, see mem_count */
 +      struct bb_memory_contains memory[0];
 +};
 +
 +static struct bb_reg_state *bb_reg_state, *bb_exit_state;
 +static int bb_reg_state_max, bb_reg_params, bb_memory_params;
 +
 +struct bb_actual
 +{
 +      bfd_vma value;
 +      int valid;
 +};
 +
 +/* Contains the actual hex value of a register, plus a valid bit.  Indexed by
 + * register value 'reg - BBRG_RAX'
 + */
 +static struct bb_actual bb_actual[KDB_INT_REGISTERS];
 +
 +static bfd_vma bb_func_start, bb_func_end;
 +static bfd_vma bb_common_interrupt, bb_error_entry, bb_ret_from_intr,
 +             bb_thread_return, bb_sync_regs, bb_save_v86_state,
 +             bb__sched_text_start, bb__sched_text_end,
 +             bb_save_args, bb_save_rest, bb_save_paranoid;
 +
 +/* Record jmp instructions, both conditional and unconditional.  These form the
 + * arcs between the basic blocks.  This is also used to record the state when
 + * one block drops through into the next.
 + *
 + * A bb can have multiple associated bb_jmp entries, one for each jcc
 + * instruction plus at most one bb_jmp for the drop through case.  If a bb
 + * drops through to the next bb then the drop through bb_jmp entry will be the
 + * last entry in the set of bb_jmp's that are associated with the bb.  This is
 + * enforced by the fact that jcc entries are added during the disassembly phase
 + * of pass 1, the drop through entries are added near the end of pass 1.
 + *
 + * At address 'from' in this block, we have a jump to address 'to'.  The
 + * register state at 'from' is copied to the target block.
 + */
 +
 +struct bb_jmp
 +{
 +      bfd_vma from;
 +      bfd_vma to;
 +      struct bb_reg_state *state;
 +      unsigned int drop_through: 1;
 +};
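 +
 +/* Illustrative only (addresses made up): a conditional branch such as
 + *    jne    0xc0101234
 + * at address 0xc0101200 records the arc
 + *    { .from = 0xc0101200, .to = 0xc0101234, .drop_through = 0 }
 + * and when one bb simply falls into the next, pass 1 adds a drop through arc
 + * from the last address of the first bb to the start of the second with
 + * drop_through set to 1.
 + */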
 +
 +struct bb
 +{
 +      bfd_vma start;
 +      /* The end address of a basic block is sloppy.  It can be the first
 +       * byte of the last instruction in the block or it can be the last byte
 +       * of the block.
 +       */
 +      bfd_vma end;
 +      unsigned int changed: 1;
 +      unsigned int drop_through: 1;
 +};
 +
 +static struct bb **bb_list, *bb_curr;
 +static int bb_max, bb_count;
 +
 +static struct bb_jmp *bb_jmp_list;
 +static int bb_jmp_max, bb_jmp_count;
 +
 +/* Add a new bb entry to the list.  This does an insert sort. */
 +
 +static struct bb *
 +bb_new(bfd_vma order)
 +{
 +      int i, j;
 +      struct bb *bb, *p;
 +      if (bb_giveup)
 +              return NULL;
 +      if (bb_count == bb_max) {
 +              struct bb **bb_list_new;
 +              bb_max += 10;
 +              bb_list_new = debug_kmalloc(bb_max*sizeof(*bb_list_new),
 +                                          GFP_ATOMIC);
 +              if (!bb_list_new) {
 +                      kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
 +                      bb_giveup = 1;
 +                      return NULL;
 +              }
 +              memcpy(bb_list_new, bb_list, bb_count*sizeof(*bb_list));
 +              debug_kfree(bb_list);
 +              bb_list = bb_list_new;
 +      }
 +      bb = debug_kmalloc(sizeof(*bb), GFP_ATOMIC);
 +      if (!bb) {
 +              kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
 +              bb_giveup = 1;
 +              return NULL;
 +      }
 +      memset(bb, 0, sizeof(*bb));
 +      for (i = 0; i < bb_count; ++i) {
 +              p = bb_list[i];
 +              if ((p->start && p->start > order) ||
 +                  (p->end && p->end > order))
 +                      break;
 +      }
 +      for (j = bb_count-1; j >= i; --j)
 +              bb_list[j+1] = bb_list[j];
 +      bb_list[i] = bb;
 +      ++bb_count;
 +      return bb;
 +}
 +
 +/* Add a new bb_jmp entry to the list.  This list is not sorted. */
 +
 +static struct bb_jmp *
 +bb_jmp_new(bfd_vma from, bfd_vma to, unsigned int drop_through)
 +{
 +      struct bb_jmp *bb_jmp;
 +      if (bb_giveup)
 +              return NULL;
 +      if (bb_jmp_count == bb_jmp_max) {
 +              struct bb_jmp *bb_jmp_list_new;
 +              bb_jmp_max += 10;
 +              bb_jmp_list_new =
 +                      debug_kmalloc(bb_jmp_max*sizeof(*bb_jmp_list_new),
 +                                    GFP_ATOMIC);
 +              if (!bb_jmp_list_new) {
 +                      kdb_printf("\n\n%s: out of debug_kmalloc\n",
 +                                 __FUNCTION__);
 +                      bb_giveup = 1;
 +                      return NULL;
 +              }
 +              memcpy(bb_jmp_list_new, bb_jmp_list,
 +                     bb_jmp_count*sizeof(*bb_jmp_list));
 +              debug_kfree(bb_jmp_list);
 +              bb_jmp_list = bb_jmp_list_new;
 +      }
 +      bb_jmp = bb_jmp_list + bb_jmp_count++;
 +      bb_jmp->from = from;
 +      bb_jmp->to = to;
 +      bb_jmp->drop_through = drop_through;
 +      bb_jmp->state = NULL;
 +      return bb_jmp;
 +}
 +
 +static void
 +bb_delete(int i)
 +{
 +      struct bb *bb = bb_list[i];
 +      memcpy(bb_list+i, bb_list+i+1, (bb_count-i-1)*sizeof(*bb_list));
 +      bb_list[--bb_count] = NULL;
 +      debug_kfree(bb);
 +}
 +
 +static struct bb *
 +bb_add(bfd_vma start, bfd_vma end)
 +{
 +      int i;
 +      struct bb *bb;
 +      /* Ignore basic blocks whose start address is outside the current
 +       * function.  These occur for call instructions and for tail recursion.
 +       */
 +      if (start &&
 +          (start < bb_func_start || start >= bb_func_end))
 +                     return NULL;
 +      for (i = 0; i < bb_count; ++i) {
 +              bb = bb_list[i];
 +              if ((start && bb->start == start) ||
 +                  (end && bb->end == end))
 +                      return bb;
 +      }
 +      bb = bb_new(start ? start : end);
 +      if (bb) {
 +              bb->start = start;
 +              bb->end = end;
 +      }
 +      return bb;
 +}
 +
 +static struct bb_jmp *
 +bb_jmp_add(bfd_vma from, bfd_vma to, unsigned int drop_through)
 +{
 +      int i;
 +      struct bb_jmp *bb_jmp;
 +      for (i = 0, bb_jmp = bb_jmp_list; i < bb_jmp_count; ++i, ++bb_jmp) {
 +              if (bb_jmp->from == from &&
 +                  bb_jmp->to == to &&
 +                  bb_jmp->drop_through == drop_through)
 +                      return bb_jmp;
 +      }
 +      bb_jmp = bb_jmp_new(from, to, drop_through);
 +      return bb_jmp;
 +}
 +
 +static unsigned long bb_curr_addr, bb_exit_addr;
 +static char bb_buffer[256];   /* A bit too big to go on stack */
 +
 +/* Computed jmp uses 'jmp *addr(,%reg,[48])' where 'addr' is the start of a
 + * table of addresses that point into the current function.  Run the table and
 + * generate bb starts for each target address plus a bb_jmp from this address
 + * to the target address.
 + *
 + * Only called for 'jmp' instructions, with the pointer starting at 'jmp'.
 + */
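 +
 +/* Illustrative only (address and register made up): disassembly text such as
 + *    jmp    *0xc03f1000(,%rax,8)
 + * names a jump table at 0xc03f1000 whose entries are KDB_WORD_SIZE bytes
 + * apart (8 here, matching x86_64; 4 on i386).  Each table entry that still
 + * points inside the current function becomes a bb start plus a bb_jmp arc
 + * from this jmp to that entry; the walk stops at the first entry that falls
 + * outside the function.
 + */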
 +
 +static void
 +bb_pass1_computed_jmp(char *p)
 +{
 +      unsigned long table, scale;
 +      kdb_machreg_t addr;
 +      struct bb* bb;
 +      p += strcspn(p, " \t");         /* end of instruction */
 +      p += strspn(p, " \t");          /* start of address */
 +      if (*p++ != '*')
 +              return;
 +      table = simple_strtoul(p, &p, 0);
 +      if (strncmp(p, "(,%", 3) != 0)
 +              return;
 +      p += 3;
 +      p += strcspn(p, ",");           /* end of reg */
 +      if (*p++ != ',')
 +              return;
 +      scale = simple_strtoul(p, &p, 0);
 +      if (scale != KDB_WORD_SIZE || strcmp(p, ")"))
 +              return;
 +      while (!bb_giveup) {
 +              if (kdb_getword(&addr, table, sizeof(addr)))
 +                      return;
 +              if (addr < bb_func_start || addr >= bb_func_end)
 +                      return;
 +              bb = bb_add(addr, 0);
 +              if (bb)
 +                      bb_jmp_add(bb_curr_addr, addr, 0);
 +              table += KDB_WORD_SIZE;
 +      }
 +}
 +
 +/* Pass 1, identify the start and end of each basic block */
 +
 +static int
 +bb_dis_pass1(PTR file, const char *fmt, ...)
 +{
 +      int l = strlen(bb_buffer);
 +      char *p;
 +      va_list ap;
 +      va_start(ap, fmt);
 +      vsnprintf(bb_buffer + l, sizeof(bb_buffer) - l, fmt, ap);
 +      va_end(ap);
 +      if ((p = strchr(bb_buffer, '\n'))) {
 +              *p = '\0';
 +              /* ret[q], iret[q], sysexit, sysret, ud2a or jmp[q] end a
 +               * block.  As does a call to a function marked noret.
 +               */
 +              p = bb_buffer;
 +              p += strcspn(p, ":");
 +              if (*p++ == ':') {
 +                      bb_fixup_switch_to(p);
 +                      p += strspn(p, " \t");  /* start of instruction */
 +                      if (strncmp(p, "ret", 3) == 0 ||
 +                          strncmp(p, "iret", 4) == 0 ||
 +                          strncmp(p, "sysexit", 7) == 0 ||
 +                          strncmp(p, "sysret", 6) == 0 ||
 +                          strncmp(p, "ud2a", 4) == 0 ||
 +                          strncmp(p, "jmp", 3) == 0) {
 +                              if (strncmp(p, "jmp", 3) == 0)
 +                                      bb_pass1_computed_jmp(p);
 +                              bb_add(0, bb_curr_addr);
 +                      }
 +                      if (strncmp(p, "call", 4) == 0) {
 +                              strsep(&p, " \t");      /* end of opcode */
 +                              if (p)
 +                                      p += strspn(p, " \t");  /* operand(s) */
 +                              if (p && strchr(p, '<')) {
 +                                      p = strchr(p, '<') + 1;
 +                                      *strchr(p, '>') = '\0';
 +                                      if (bb_noret(p))
 +                                              bb_add(0, bb_curr_addr);
 +                              }
 +                      }
 +              }
 +              bb_buffer[0] = '\0';
 +      }
 +      return 0;
 +}
 +
 +static void
 +bb_printaddr_pass1(bfd_vma addr, disassemble_info *dip)
 +{
 +      kdb_symtab_t symtab;
 +      unsigned int offset;
 +      struct bb* bb;
 +      /* disasm only calls the printaddr routine for the target of jmp, loop
 +       * or call instructions, i.e. the start of a basic block.  call is
 +       * ignored by bb_add because the target address is outside the current
 +       * function.
 +       */
 +      dip->fprintf_func(dip->stream, "0x%lx", addr);
 +      kdbnearsym(addr, &symtab);
 +      if (symtab.sym_name) {
 +              dip->fprintf_func(dip->stream, " <%s", symtab.sym_name);
 +              if ((offset = addr - symtab.sym_start))
 +                      dip->fprintf_func(dip->stream, "+0x%x", offset);
 +              dip->fprintf_func(dip->stream, ">");
 +      }
 +      bb = bb_add(addr, 0);
 +      if (bb)
 +              bb_jmp_add(bb_curr_addr, addr, 0);
 +}
 +
 +static void
 +bb_pass1(void)
 +{
 +      int i;
 +      unsigned long addr;
 +      struct bb *bb;
 +      struct bb_jmp *bb_jmp;
 +
 +      if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
 +              kdb_printf("%s: func_name %s func_start " kdb_bfd_vma_fmt0
 +                         " func_end " kdb_bfd_vma_fmt0 "\n",
 +                         __FUNCTION__,
 +                         bb_func_name,
 +                         bb_func_start,
 +                         bb_func_end);
 +      kdb_di.fprintf_func = bb_dis_pass1;
 +      kdb_di.print_address_func = bb_printaddr_pass1;
 +
 +      bb_add(bb_func_start, 0);
 +      for (bb_curr_addr = bb_func_start;
 +           bb_curr_addr < bb_func_end;
 +           ++bb_curr_addr) {
 +              unsigned char c;
 +              if (kdb_getarea(c, bb_curr_addr)) {
 +                      kdb_printf("%s: unreadable function code at ",
 +                                 __FUNCTION__);
 +                      kdb_symbol_print(bb_curr_addr, NULL, KDB_SP_DEFAULT);
 +                      kdb_printf(", giving up\n");
 +                      bb_giveup = 1;
 +                      return;
 +              }
 +      }
 +      for (addr = bb_func_start; addr < bb_func_end; ) {
 +              bb_curr_addr = addr;
 +              addr += kdba_id_printinsn(addr, &kdb_di);
 +              kdb_di.fprintf_func(NULL, "\n");
 +      }
 +      if (bb_giveup)
 +              goto out;
 +
 +      /* Special case: a block may consist of a single instruction which is
 +       * both the target of a jmp and an ending instruction, so two blocks
 +       * were added using the same address, one as a start and one as an
 +       * end, in no guaranteed order.  The end must be ordered after the
 +       * start.
 +       */
 +      for (i = 0; i < bb_count-1; ++i) {
 +              struct bb *bb1 = bb_list[i], *bb2 = bb_list[i+1];
 +              if (bb1->end && bb1->end == bb2->start) {
 +                      bb = bb_list[i+1];
 +                      bb_list[i+1] = bb_list[i];
 +                      bb_list[i] = bb;
 +              }
 +      }
 +
 +      /* Some bb have a start address, some have an end address.  Collapse
 +       * them into entries that have both start and end addresses.  The first
 +       * entry is guaranteed to have a start address.
 +       */
 +      for (i = 0; i < bb_count-1; ++i) {
 +              struct bb *bb1 = bb_list[i], *bb2 = bb_list[i+1];
 +              if (bb1->end)
 +                      continue;
 +              if (bb2->start) {
 +                      bb1->end = bb2->start - 1;
 +                      bb1->drop_through = 1;
 +                      bb_jmp_add(bb1->end, bb2->start, 1);
 +              } else {
 +                      bb1->end = bb2->end;
 +                      bb_delete(i+1);
 +              }
 +      }
 +      bb = bb_list[bb_count-1];
 +      if (!bb->end)
 +              bb->end = bb_func_end - 1;
 +
 +      /* It would be nice to check that all bb have a valid start and end
 +       * address but there is just too much garbage code in the kernel to do
 +       * that check.  Aligned functions in assembler code mean that there is
 +       * space between the end of one function and the start of the next and
 +       * that space contains previous code from the assembler's buffers.  It
 +       * looks like dead code with nothing that branches to it, so no start
 +       * address.  do_sys_vm86() ends with 'jmp resume_userspace' which the C
 +       * compiler does not know about so gcc appends the normal exit code,
 +       * again nothing branches to this dangling code.
 +       *
 +       * The best we can do is delete bb entries with no start address.
 +       */
 +      for (i = 0; i < bb_count; ++i) {
 +              struct bb *bb = bb_list[i];
 +              if (!bb->start)
 +                      bb_delete(i--);
 +      }
 +      for (i = 0; i < bb_count; ++i) {
 +              struct bb *bb = bb_list[i];
 +              if (!bb->end) {
 +                      kdb_printf("%s: incomplete bb state\n", __FUNCTION__);
 +                      bb_giveup = 1;
 +                      goto debug;
 +              }
 +      }
 +
 +out:
 +      if (!KDB_DEBUG(BB))
 +              return;
 +debug:
 +      kdb_printf("%s: end\n", __FUNCTION__);
 +      for (i = 0; i < bb_count; ++i) {
 +              bb = bb_list[i];
 +              kdb_printf("  bb[%d] start "
 +                         kdb_bfd_vma_fmt0
 +                         " end " kdb_bfd_vma_fmt0
 +                         " drop_through %d",
 +                         i, bb->start, bb->end, bb->drop_through);
 +              kdb_printf("\n");
 +      }
 +      for (i = 0; i < bb_jmp_count; ++i) {
 +              bb_jmp = bb_jmp_list + i;
 +              kdb_printf("  bb_jmp[%d] from "
 +                         kdb_bfd_vma_fmt0
 +                         " to " kdb_bfd_vma_fmt0
 +                         " drop_through %d\n",
 +                         i, bb_jmp->from, bb_jmp->to, bb_jmp->drop_through);
 +      }
 +}
 +
 +/* Pass 2, record register changes in each basic block */
 +
 +/* For each opcode that we care about, indicate how it uses its operands.  Most
 + * opcodes can be handled generically because they completely specify their
 + * operands in the instruction; however, many opcodes have side effects such as
 + * reading or writing rax or updating rsp.  Instructions that change registers
 + * that are not listed in the operands must be handled as special cases.  In
 + * addition, instructions that copy registers while preserving their contents
 + * (push, pop, mov) or change the contents in a well defined way (add with an
 + * immediate, lea) must be handled as special cases in order to track the
 + * register contents.
 + *
 + * The tables below only list opcodes that are actually used in the Linux
 + * kernel, so they omit most of the floating point and all of the SSE type
 + * instructions.  The operand usage entries only cater for accesses to memory
 + * and to the integer registers, accesses to floating point registers and flags
 + * are not relevant for kernel backtraces.
 + */
 +
 +enum bb_operand_usage {
 +      BBOU_UNKNOWN = 0,
 +              /* Generic entries.  Because xchg can do any combination of
 +               * read src, write src, read dst and write dst, we need to
 +               * define all 16 possibilities.  These are ordered by rs = 1,
 +               * rd = 2, ws = 4, wd = 8; the bb_usage_x*() functions rely on
 +               * this order.
 +               */
 +      BBOU_RS = 1,    /* read src */          /*  1 */
 +      BBOU_RD,        /* read dst */          /*  2 */
 +      BBOU_RSRD,                              /*  3 */
 +      BBOU_WS,        /* write src */         /*  4 */
 +      BBOU_RSWS,                              /*  5 */
 +      BBOU_RDWS,                              /*  6 */
 +      BBOU_RSRDWS,                            /*  7 */
 +      BBOU_WD,        /* write dst */         /*  8 */
 +      BBOU_RSWD,                              /*  9 */
 +      BBOU_RDWD,                              /* 10 */
 +      BBOU_RSRDWD,                            /* 11 */
 +      BBOU_WSWD,                              /* 12 */
 +      BBOU_RSWSWD,                            /* 13 */
 +      BBOU_RDWSWD,                            /* 14 */
 +      BBOU_RSRDWSWD,                          /* 15 */
 +              /* opcode specific entries */
 +      BBOU_ADD,
 +      BBOU_AND,
 +      BBOU_CALL,
 +      BBOU_CBW,
 +      BBOU_CMOV,
 +      BBOU_CMPXCHG,
 +      BBOU_CMPXCHGD,
 +      BBOU_CPUID,
 +      BBOU_CWD,
 +      BBOU_DIV,
 +      BBOU_IDIV,
 +      BBOU_IMUL,
 +      BBOU_IRET,
 +      BBOU_JMP,
 +      BBOU_LAHF,
 +      BBOU_LEA,
 +      BBOU_LEAVE,
 +      BBOU_LODS,
 +      BBOU_LOOP,
 +      BBOU_LSS,
 +      BBOU_MONITOR,
 +      BBOU_MOV,
 +      BBOU_MOVS,
 +      BBOU_MUL,
 +      BBOU_MWAIT,
 +      BBOU_NOP,
 +      BBOU_OUTS,
 +      BBOU_POP,
 +      BBOU_POPF,
 +      BBOU_PUSH,
 +      BBOU_PUSHF,
 +      BBOU_RDMSR,
 +      BBOU_RDTSC,
 +      BBOU_RET,
 +      BBOU_SAHF,
 +      BBOU_SCAS,
 +      BBOU_SUB,
 +      BBOU_SYSEXIT,
 +      BBOU_SYSRET,
 +      BBOU_WRMSR,
 +      BBOU_XADD,
 +      BBOU_XCHG,
 +      BBOU_XOR,
 +};
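 +
 +/* As a rough illustration: the generic values form a bitmask, so BBOU_RSWD (9)
 + * means "read src, write dst" (1 + 8), which is how bsf is described in the
 + * table below, while BBOU_RSRDWD (11) adds "read dst" (1 + 2 + 8) for
 + * read-modify-write forms such as adc.
 + */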
 +
 +struct bb_opcode_usage {
 +      int length;
 +      enum bb_operand_usage usage;
 +      const char *opcode;
 +};
 +
 +/* This table is sorted in alphabetical order of opcode, except that the
 + * trailing '"' is treated as a high value.  For example, 'in' sorts after
 + * 'inc', 'bt' after 'btc'.  This modified sort order ensures that shorter
 + * opcodes come after long ones.  A normal sort would put 'in' first, so 'in'
 + * would match both 'inc' and 'in'.  When adding any new entries to this table,
 + * be careful to put shorter entries last in their group.
 + *
 + * To automatically sort the table (in vi)
 + *   Mark the first and last opcode line with 'a and 'b
 + *   'a
 + *   !'bsed -e 's/"}/}}/' | LANG=C sort -t '"' -k2 | sed -e 's/}}/"}/'
 + *
 + * If a new instruction has to be added, first consider if it affects registers
 + * other than those listed in the operands.  Also consider if you want to track
 + * the results of issuing the instruction, IOW can you extract useful
 + * information by looking in detail at the modified registers or memory.  If
 + * either test is true then you need a special case to handle the instruction.
 + *
 + * The generic entries at the start of enum bb_operand_usage all have one thing
 + * in common: if a register or memory location is updated then that location
 + * becomes undefined, i.e. we lose track of anything that was previously saved
 + * in that location.  So only use a generic BBOU_* value when the result of the
 + * instruction cannot be calculated exactly _and_ when all the affected
 + * registers are listed in the operands.
 + *
 + * Examples:
 + *
 + * 'call' does not generate a known result, but as a side effect of call,
 + * several scratch registers become undefined, so it needs a special BBOU_CALL
 + * entry.
 + *
 + * 'adc' generates a variable result because it depends on the carry flag, so
 + * 'adc' gets a generic entry.  'add' can generate an exact result (add with
 + * immediate on a register that points to the stack) or it can generate an
 + * unknown result (add a variable, or add immediate to a register that does not
 + * contain a stack pointer) so 'add' has its own BBOU_ADD entry.
 + */
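 +
 +/* Illustrative only: matching is a prefix compare of 'length' characters, so a
 + * disassembled "movl" matches the {3, BBOU_MOV, "mov"} entry while "movsl"
 + * must be caught first by {4, BBOU_MOVS, "movs"}; the modified sort order
 + * described above is what guarantees that "movs" is tried before "mov".
 + */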
 +
 +static const struct bb_opcode_usage
 +bb_opcode_usage_all[] = {
 +      {3, BBOU_RSRDWD,  "adc"},
 +      {3, BBOU_ADD,     "add"},
 +      {3, BBOU_AND,     "and"},
 +      {3, BBOU_RSWD,    "bsf"},
 +      {3, BBOU_RSWD,    "bsr"},
 +      {5, BBOU_RSWS,    "bswap"},
 +      {3, BBOU_RSRDWD,  "btc"},
 +      {3, BBOU_RSRDWD,  "btr"},
 +      {3, BBOU_RSRDWD,  "bts"},
 +      {2, BBOU_RSRD,    "bt"},
 +      {4, BBOU_CALL,    "call"},
 +      {4, BBOU_CBW,     "cbtw"},      /* Intel cbw */
 +      {3, BBOU_NOP,     "clc"},
 +      {3, BBOU_NOP,     "cld"},
 +      {7, BBOU_RS,      "clflush"},
 +      {4, BBOU_NOP,     "clgi"},
 +      {3, BBOU_NOP,     "cli"},
 +      {4, BBOU_CWD,     "cltd"},      /* Intel cdq */
 +      {4, BBOU_CBW,     "cltq"},      /* Intel cdqe */
 +      {4, BBOU_NOP,     "clts"},
 +      {4, BBOU_CMOV,    "cmov"},
 +      {9, BBOU_CMPXCHGD,"cmpxchg16"},
 +      {8, BBOU_CMPXCHGD,"cmpxchg8"},
 +      {7, BBOU_CMPXCHG, "cmpxchg"},
 +      {3, BBOU_RSRD,    "cmp"},
 +      {5, BBOU_CPUID,   "cpuid"},
 +      {4, BBOU_CWD,     "cqto"},      /* Intel cqo */
 +      {4, BBOU_CWD,     "cwtd"},      /* Intel cwd */
 +      {4, BBOU_CBW,     "cwtl"},      /* Intel cwde */
 +      {4, BBOU_NOP,     "data"},      /* alternative ASM_NOP<n> generates data16 on x86_64 */
 +      {3, BBOU_RSWS,    "dec"},
 +      {3, BBOU_DIV,     "div"},
 +      {5, BBOU_RS,      "fdivl"},
 +      {5, BBOU_NOP,     "finit"},
 +      {6, BBOU_RS,      "fistpl"},
 +      {4, BBOU_RS,      "fldl"},
 +      {4, BBOU_RS,      "fmul"},
 +      {6, BBOU_NOP,     "fnclex"},
 +      {6, BBOU_NOP,     "fninit"},
 +      {6, BBOU_RS,      "fnsave"},
 +      {7, BBOU_NOP,     "fnsetpm"},
 +      {6, BBOU_RS,      "frstor"},
 +      {5, BBOU_WS,      "fstsw"},
 +      {5, BBOU_RS,      "fsubp"},
 +      {5, BBOU_NOP,     "fwait"},
 +      {7, BBOU_RS,      "fxrstor"},
 +      {6, BBOU_RS,      "fxsave"},
 +      {3, BBOU_NOP,     "hlt"},
 +      {4, BBOU_IDIV,    "idiv"},
 +      {4, BBOU_IMUL,    "imul"},
 +      {3, BBOU_RSWS,    "inc"},
 +      {3, BBOU_NOP,     "int"},
 +      {7, BBOU_RSRD,    "invlpga"},
 +      {6, BBOU_RS,      "invlpg"},
 +      {2, BBOU_RSWD,    "in"},
 +      {4, BBOU_IRET,    "iret"},
 +      {1, BBOU_JMP,     "j"},
 +      {4, BBOU_LAHF,    "lahf"},
 +      {3, BBOU_RSWD,    "lar"},
 +      {5, BBOU_RS,      "lcall"},
 +      {5, BBOU_LEAVE,   "leave"},
 +      {3, BBOU_LEA,     "lea"},
 +      {6, BBOU_NOP,     "lfence"},
 +      {4, BBOU_RS,      "lgdt"},
 +      {4, BBOU_RS,      "lidt"},
 +      {4, BBOU_RS,      "ljmp"},
 +      {4, BBOU_RS,      "lldt"},
 +      {4, BBOU_RS,      "lmsw"},
 +      {4, BBOU_LODS,    "lods"},
 +      {4, BBOU_LOOP,    "loop"},
 +      {4, BBOU_NOP,     "lret"},
 +      {3, BBOU_RSWD,    "lsl"},
 +      {3, BBOU_LSS,     "lss"},
 +      {3, BBOU_RS,      "ltr"},
 +      {6, BBOU_NOP,     "mfence"},
 +      {7, BBOU_MONITOR, "monitor"},
 +      {4, BBOU_MOVS,    "movs"},
 +      {3, BBOU_MOV,     "mov"},
 +      {3, BBOU_MUL,     "mul"},
 +      {5, BBOU_MWAIT,   "mwait"},
 +      {3, BBOU_RSWS,    "neg"},
 +      {3, BBOU_NOP,     "nop"},
 +      {3, BBOU_RSWS,    "not"},
 +      {2, BBOU_RSRDWD,  "or"},
 +      {4, BBOU_OUTS,    "outs"},
 +      {3, BBOU_RSRD,    "out"},
 +      {5, BBOU_NOP,     "pause"},
 +      {4, BBOU_POPF,    "popf"},
 +      {3, BBOU_POP,     "pop"},
 +      {8, BBOU_RS,      "prefetch"},
 +      {5, BBOU_PUSHF,   "pushf"},
 +      {4, BBOU_PUSH,    "push"},
 +      {3, BBOU_RSRDWD,  "rcl"},
 +      {3, BBOU_RSRDWD,  "rcr"},
 +      {5, BBOU_RDMSR,   "rdmsr"},
 +      {5, BBOU_RDMSR,   "rdpmc"},     /* same side effects as rdmsr */
 +      {5, BBOU_RDTSC,   "rdtsc"},
 +      {3, BBOU_RET,     "ret"},
 +      {3, BBOU_RSRDWD,  "rol"},
 +      {3, BBOU_RSRDWD,  "ror"},
 +      {4, BBOU_SAHF,    "sahf"},
 +      {3, BBOU_RSRDWD,  "sar"},
 +      {3, BBOU_RSRDWD,  "sbb"},
 +      {4, BBOU_SCAS,    "scas"},
 +      {3, BBOU_WS,      "set"},
 +      {6, BBOU_NOP,     "sfence"},
 +      {4, BBOU_WS,      "sgdt"},
 +      {3, BBOU_RSRDWD,  "shl"},
 +      {3, BBOU_RSRDWD,  "shr"},
 +      {4, BBOU_WS,      "sidt"},
 +      {4, BBOU_WS,      "sldt"},
 +      {3, BBOU_NOP,     "stc"},
 +      {3, BBOU_NOP,     "std"},
 +      {4, BBOU_NOP,     "stgi"},
 +      {3, BBOU_NOP,     "sti"},
 +      {4, BBOU_SCAS,    "stos"},
 +      {4, BBOU_WS,      "strl"},
 +      {3, BBOU_WS,      "str"},
 +      {3, BBOU_SUB,     "sub"},
 +      {6, BBOU_NOP,     "swapgs"},
 +      {7, BBOU_SYSEXIT, "sysexit"},
 +      {6, BBOU_SYSRET,  "sysret"},
 +      {4, BBOU_NOP,     "test"},
 +      {4, BBOU_NOP,     "ud2a"},
 +      {7, BBOU_RS,      "vmclear"},
 +      {8, BBOU_NOP,     "vmlaunch"},
 +      {6, BBOU_RS,      "vmload"},
 +      {7, BBOU_RS,      "vmptrld"},
 +      {6, BBOU_WD,      "vmread"},    /* vmread src is an encoding, not a register */
 +      {8, BBOU_NOP,     "vmresume"},
 +      {5, BBOU_RS,      "vmrun"},
 +      {6, BBOU_RS,      "vmsave"},
 +      {7, BBOU_WD,      "vmwrite"},   /* vmwrite src is an encoding, not a register */
 +      {3, BBOU_NOP,     "vmxoff"},
 +      {6, BBOU_NOP,     "wbinvd"},
 +      {5, BBOU_WRMSR,   "wrmsr"},
 +      {4, BBOU_XADD,    "xadd"},
 +      {4, BBOU_XCHG,    "xchg"},
 +      {3, BBOU_XOR,     "xor"},
 +      {4, BBOU_NOP,     "xrstor"},
 +      {4, BBOU_NOP,     "xsave"},
 +      {10, BBOU_WS,     "xstore-rng"},
 +};
 +
 +/* To speed up searching, index bb_opcode_usage_all by the first letter of each
 + * opcode.
 + */
 +static struct {
 +      const struct bb_opcode_usage *opcode;
 +      int size;
 +} bb_opcode_usage[26];
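 +
 +/* For illustration only: with the table above, bb_opcode_usage['m' - 'a'] would
 + * point at the "mfence" entry with a size of 6, covering "mfence" through
 + * "mwait", so bb_parse_opcode() only has to scan the entries that share the
 + * opcode's first letter.
 + */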
 +
 +struct bb_operand {
 +      char *base;
 +      char *index;
 +      char *segment;
 +      long disp;
 +      unsigned int scale;
 +      enum bb_reg_code base_rc;               /* UNDEFINED or RAX through R15 */
 +      enum bb_reg_code index_rc;              /* UNDEFINED or RAX through R15 */
 +      unsigned int present            :1;
 +      unsigned int disp_present       :1;
 +      unsigned int indirect           :1;     /* must be combined with reg or memory */
 +      unsigned int immediate          :1;     /* exactly one of these 3 must be set */
 +      unsigned int reg                :1;
 +      unsigned int memory             :1;
 +};
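 +
 +/* Illustrative only (offset and registers made up): an operand written as
 + *    0x10(%rsp,%rax,8)
 + * parses to disp 0x10, base "%rsp", index "%rax", scale 8 with memory set,
 + * and base_rc/index_rc mapped to BBRG_RSP/BBRG_RAX; "$0x10" parses as an
 + * immediate and "*%rax" as an indirect register operand.
 + */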
 +
 +struct bb_decode {
 +      char *prefix;
 +      char *opcode;
 +      const struct bb_opcode_usage *match;
 +      struct bb_operand src;
 +      struct bb_operand dst;
 +      struct bb_operand dst2;
 +};
 +
 +static struct bb_decode bb_decode;
 +
 +static enum bb_reg_code
 +bb_reg_map(const char *reg)
 +{
 +      int lo, hi, c;
 +      const struct bb_reg_code_map *p;
 +      lo = 0;
 +      hi = ARRAY_SIZE(bb_reg_code_map) - 1;
 +      while (lo <= hi) {
 +              int mid = (hi + lo) / 2;
 +              p = bb_reg_code_map + mid;
 +              c = strcmp(p->name, reg+1);
 +              if (c == 0)
 +                      return p->reg;
 +              else if (c > 0)
 +                      hi = mid - 1;
 +              else
 +                      lo = mid + 1;
 +      }
 +      return BBRG_UNDEFINED;
 +}
 +
 +static void
 +bb_parse_operand(char *str, struct bb_operand *operand)
 +{
 +      char *p = str;
 +      int sign = 1;
 +      operand->present = 1;
 +      /* extract any segment prefix */
 +      if (p[0] == '%' && p[1] && p[2] == 's' && p[3] == ':') {
 +              operand->memory = 1;
 +              operand->segment = p;
 +              p[3] = '\0';
 +              p += 4;
 +      }
 +      /* extract displacement, base, index, scale */
 +      if (*p == '*') {
 +              /* jmp/call *disp(%reg), *%reg or *0xnnn */
 +              operand->indirect = 1;
 +              ++p;
 +      }
 +      if (*p == '-') {
 +              sign = -1;
 +              ++p;
 +      }
 +      if (*p == '$') {
 +              operand->immediate = 1;
 +              operand->disp_present = 1;
 +              operand->disp = simple_strtoul(p+1, &p, 0);
 +      } else if (isdigit(*p)) {
 +              operand->memory = 1;
 +              operand->disp_present = 1;
 +              operand->disp = simple_strtoul(p, &p, 0) * sign;
 +      }
 +      if (*p == '%') {
 +              operand->reg = 1;
 +              operand->base = p;
 +      } else if (*p == '(') {
 +              operand->memory = 1;
 +              operand->base = ++p;
 +              p += strcspn(p, ",)");
 +              if (p == operand->base)
 +                      operand->base = NULL;
 +              if (*p == ',') {
 +                      *p = '\0';
 +                      operand->index = ++p;
 +                      p += strcspn(p, ",)");
 +                      if (p == operand->index)
 +                              operand->index = NULL;
 +              }
 +              if (*p == ',') {
 +                      *p = '\0';
 +                      operand->scale = simple_strtoul(p+1, &p, 0);
 +              }
 +              *p = '\0';
 +      } else if (*p) {
 +              kdb_printf("%s: unexpected token '%c' after disp '%s'\n",
 +                         __FUNCTION__, *p, str);
 +              bb_giveup = 1;
 +      }
 +      if ((operand->immediate + operand->reg + operand->memory != 1) ||
 +          (operand->indirect && operand->immediate)) {
 +              kdb_printf("%s: incorrect decode '%s' N %d I %d R %d M %d\n",
 +                         __FUNCTION__, str,
 +                         operand->indirect, operand->immediate, operand->reg,
 +                         operand->memory);
 +              bb_giveup = 1;
 +      }
 +      if (operand->base)
 +              operand->base_rc = bb_reg_map(operand->base);
 +      if (operand->index)
 +              operand->index_rc = bb_reg_map(operand->index);
 +}
 +
 +static void
 +bb_print_operand(const char *type, const struct bb_operand *operand)
 +{
 +      if (!operand->present)
 +              return;
 +      kdb_printf("  %s %c%c: ",
 +                 type,
 +                 operand->indirect ? 'N' : ' ',
 +                 operand->immediate ? 'I' :
 +                   operand->reg ? 'R' :
 +                   operand->memory ? 'M' :
 +                   '?'
 +                 );
 +      if (operand->segment)
 +              kdb_printf("%s:", operand->segment);
 +      if (operand->immediate) {
 +              kdb_printf("$0x%lx", operand->disp);
 +      } else if (operand->reg) {
 +              if (operand->indirect)
 +                      kdb_printf("*");
 +              kdb_printf("%s", operand->base);
 +      } else if (operand->memory) {
 +              if (operand->indirect && (operand->base || operand->index))
 +                      kdb_printf("*");
 +              if (operand->disp_present) {
 +                      kdb_printf("0x%lx", operand->disp);
 +              }
 +              if (operand->base || operand->index || operand->scale) {
 +                      kdb_printf("(");
 +                      if (operand->base)
 +                              kdb_printf("%s", operand->base);
 +                      if (operand->index || operand->scale)
 +                              kdb_printf(",");
 +                      if (operand->index)
 +                              kdb_printf("%s", operand->index);
 +                      if (operand->scale)
 +                              kdb_printf(",%d", operand->scale);
 +                      kdb_printf(")");
 +              }
 +      }
 +      if (operand->base_rc)
 +              kdb_printf(" base_rc %d (%s)",
 +                         operand->base_rc, bbrg_name[operand->base_rc]);
 +      if (operand->index_rc)
 +              kdb_printf(" index_rc %d (%s)",
 +                         operand->index_rc,
 +                         bbrg_name[operand->index_rc]);
 +      kdb_printf("\n");
 +}
 +
 +static void
 +bb_print_opcode(void)
 +{
 +      const struct bb_opcode_usage *o = bb_decode.match;
 +      kdb_printf("  ");
 +      if (bb_decode.prefix)
 +              kdb_printf("%s ", bb_decode.prefix);
 +      kdb_printf("opcode '%s' matched by '%s', usage %d\n",
 +                 bb_decode.opcode, o->opcode, o->usage);
 +}
 +
 +static int
 +bb_parse_opcode(void)
 +{
 +      int c, i;
 +      const struct bb_opcode_usage *o;
 +      static int bb_parse_opcode_error_limit = 5;
 +      c = bb_decode.opcode[0] - 'a';
 +      if (c < 0 || c >= ARRAY_SIZE(bb_opcode_usage))
 +              goto nomatch;
 +      o = bb_opcode_usage[c].opcode;
 +      if (!o)
 +              goto nomatch;
 +      for (i = 0; i < bb_opcode_usage[c].size; ++i, ++o) {
 +              if (strncmp(bb_decode.opcode, o->opcode, o->length) == 0) {
 +                      bb_decode.match = o;
 +                      if (KDB_DEBUG(BB))
 +                              bb_print_opcode();
 +                      return 0;
 +              }
 +      }
 +nomatch:
 +      if (!bb_parse_opcode_error_limit)
 +              return 1;
 +      --bb_parse_opcode_error_limit;
 +      kdb_printf("%s: no match at [%s]%s " kdb_bfd_vma_fmt0 " - '%s'\n",
 +                 __FUNCTION__,
 +                 bb_mod_name, bb_func_name, bb_curr_addr,
 +                 bb_decode.opcode);
 +      return 1;
 +}
 +
 +static bool
 +bb_is_int_reg(enum bb_reg_code reg)
 +{
 +      return reg >= BBRG_RAX && reg < (BBRG_RAX + KDB_INT_REGISTERS);
 +}
 +
 +static bool
 +bb_is_simple_memory(const struct bb_operand *operand)
 +{
 +      return operand->memory &&
 +             bb_is_int_reg(operand->base_rc) &&
 +             !operand->index_rc &&
 +             operand->scale == 0 &&
 +             !operand->segment;
 +}
 +
 +static bool
 +bb_is_static_disp(const struct bb_operand *operand)
 +{
 +      return operand->memory &&
 +             !operand->base_rc &&
 +             !operand->index_rc &&
 +             operand->scale == 0 &&
 +             !operand->segment &&
 +             !operand->indirect;
 +}
 +
 +static enum bb_reg_code
 +bb_reg_code_value(enum bb_reg_code reg)
 +{
 +      BB_CHECK(!bb_is_int_reg(reg), reg, 0);
 +      return bb_reg_state->contains[reg - BBRG_RAX].value;
 +}
 +
 +static short
 +bb_reg_code_offset(enum bb_reg_code reg)
 +{
 +      BB_CHECK(!bb_is_int_reg(reg), reg, 0);
 +      return bb_reg_state->contains[reg - BBRG_RAX].offset;
 +}
 +
 +static void
 +bb_reg_code_set_value(enum bb_reg_code dst, enum bb_reg_code src)
 +{
 +      BB_CHECK(!bb_is_int_reg(dst), dst, );
 +      bb_reg_state->contains[dst - BBRG_RAX].value = src;
 +}
 +
 +static void
 +bb_reg_code_set_offset(enum bb_reg_code dst, short offset)
 +{
 +      BB_CHECK(!bb_is_int_reg(dst), dst, );
 +      bb_reg_state->contains[dst - BBRG_RAX].offset = offset;
 +}
 +
 +static bool
 +bb_is_osp_defined(enum bb_reg_code reg)
 +{
 +      if (bb_is_int_reg(reg))
 +              return bb_reg_code_value(reg) == BBRG_OSP;
 +      else
 +              return 0;
 +}
 +
 +static bfd_vma
 +bb_actual_value(enum bb_reg_code reg)
 +{
 +      BB_CHECK(!bb_is_int_reg(reg), reg, 0);
 +      return bb_actual[reg - BBRG_RAX].value;
 +}
 +
 +static int
 +bb_actual_valid(enum bb_reg_code reg)
 +{
 +      BB_CHECK(!bb_is_int_reg(reg), reg, 0);
 +      return bb_actual[reg - BBRG_RAX].valid;
 +}
 +
 +static void
 +bb_actual_set_value(enum bb_reg_code reg, bfd_vma value)
 +{
 +      BB_CHECK(!bb_is_int_reg(reg), reg, );
 +      bb_actual[reg - BBRG_RAX].value = value;
 +}
 +
 +static void
 +bb_actual_set_valid(enum bb_reg_code reg, int valid)
 +{
 +      BB_CHECK(!bb_is_int_reg(reg), reg, );
 +      bb_actual[reg - BBRG_RAX].valid = valid;
 +}
 +
 +/* The scheduler code switches RSP and then does PUSH, so it is not an error
 + * for RSP to be undefined in this area of the code.
 + */
 +static bool
 +bb_is_scheduler_address(void)
 +{
 +      return bb_curr_addr >= bb__sched_text_start &&
 +             bb_curr_addr < bb__sched_text_end;
 +}
 +
 +static void
 +bb_reg_read(enum bb_reg_code reg)
 +{
 +      int i, r = 0;
 +      if (!bb_is_int_reg(reg) ||
 +          bb_reg_code_value(reg) != reg)
 +              return;
 +      for (i = 0;
 +           i < min_t(unsigned int, REGPARM, ARRAY_SIZE(bb_param_reg));
 +           ++i) {
 +              if (reg == bb_param_reg[i]) {
 +                      r = i + 1;
 +                      break;
 +              }
 +      }
 +      bb_reg_params = max(bb_reg_params, r);
 +}
 +
 +static void
 +bb_do_reg_state_print(const struct bb_reg_state *s)
 +{
 +      int i, offset_address, offset_value;
 +      const struct bb_memory_contains *c;
 +      enum bb_reg_code value;
 +      kdb_printf("  bb_reg_state %p\n", s);
 +      for (i = 0; i < ARRAY_SIZE(s->contains); ++i) {
 +              value = s->contains[i].value;
 +              offset_value = s->contains[i].offset;
 +              kdb_printf("    %s = %s",
 +                         bbrg_name[i + BBRG_RAX], bbrg_name[value]);
 +              if (value == BBRG_OSP)
 +                      KDB_DEBUG_BB_OFFSET_PRINTF(offset_value, "", "");
 +              kdb_printf("\n");
 +      }
 +      for (i = 0, c = s->memory; i < s->mem_count; ++i, ++c) {
 +              offset_address = c->offset_address;
 +              value = c->value;
 +              offset_value = c->offset_value;
 +              kdb_printf("    slot %d offset_address %c0x%x %s",
 +                         i,
 +                         offset_address >= 0 ? '+' : '-',
 +                         offset_address >= 0 ? offset_address : -offset_address,
 +                         bbrg_name[value]);
 +              if (value == BBRG_OSP)
 +                      KDB_DEBUG_BB_OFFSET_PRINTF(offset_value, "", "");
 +              kdb_printf("\n");
 +      }
 +}
 +
 +static void
 +bb_reg_state_print(const struct bb_reg_state *s)
 +{
 +      if (KDB_DEBUG(BB))
 +              bb_do_reg_state_print(s);
 +}
 +
 +/* Set register 'dst' to contain the value from 'src'.  This includes reading
 + * from 'src' and writing to 'dst'.  The offset value is copied iff 'src'
 + * contains a stack pointer.
 + *
 + * Be very careful about the context here.  'dst' and 'src' reflect integer
 + * registers by name, _not_ by the value of their contents.  "mov %rax,%rsi"
 + * will call this function as bb_reg_set_reg(BBRG_RSI, BBRG_RAX), which
 + * reflects what the assembler code is doing.  However we need to track the
 + * _values_ in the registers, not their names.  IOW, we really care about "what
 + * value does rax contain when it is copied into rsi?", so we can record the
 + * fact that we now have two copies of that value, one in rax and one in rsi.
 + */
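 +
 +/* Illustrative only: after "mov %rax,%rsi", assuming rax still held its
 + * original input value, the tracked state has
 + *    contains[BBRG_RAX - BBRG_RAX].value == BBRG_RAX
 + *    contains[BBRG_RSI - BBRG_RAX].value == BBRG_RAX
 + * i.e. two live copies of the original rax, either of which may later be
 + * saved to stack or overwritten independently.
 + */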
 +
 +static void
 +bb_reg_set_reg(enum bb_reg_code dst, enum bb_reg_code src)
 +{
 +      enum bb_reg_code src_value = BBRG_UNDEFINED;
 +      short offset_value = 0;
 +      KDB_DEBUG_BB("  %s = %s", bbrg_name[dst], bbrg_name[src]);
 +      if (bb_is_int_reg(src)) {
 +              bb_reg_read(src);
 +              src_value = bb_reg_code_value(src);
 +              KDB_DEBUG_BB(" (%s", bbrg_name[src_value]);
 +              if (bb_is_osp_defined(src)) {
 +                      offset_value = bb_reg_code_offset(src);
 +                      KDB_DEBUG_BB_OFFSET(offset_value, "", "");
 +              }
 +              KDB_DEBUG_BB(")");
 +      }
 +      if (bb_is_int_reg(dst)) {
 +              bb_reg_code_set_value(dst, src_value);
 +              bb_reg_code_set_offset(dst, offset_value);
 +      }
 +      KDB_DEBUG_BB("\n");
 +}
 +
 +static void
 +bb_reg_set_undef(enum bb_reg_code dst)
 +{
 +      bb_reg_set_reg(dst, BBRG_UNDEFINED);
 +}
 +
 +/* Delete any record of a stored register held in osp + 'offset' */
 +
 +static void
 +bb_delete_memory(short offset)
 +{
 +      int i;
 +      struct bb_memory_contains *c;
 +      for (i = 0, c = bb_reg_state->memory;
 +           i < bb_reg_state->mem_count;
 +           ++i, ++c) {
 +              if (c->offset_address == offset &&
 +                  c->value != BBRG_UNDEFINED) {
 +                      KDB_DEBUG_BB("  delete %s from ",
 +                                   bbrg_name[c->value]);
 +                      KDB_DEBUG_BB_OFFSET(offset, "osp", "");
 +                      KDB_DEBUG_BB(" slot %d\n",
 +                                   (int)(c - bb_reg_state->memory));
 +                      memset(c, BBRG_UNDEFINED, sizeof(*c));
 +                      if (i == bb_reg_state->mem_count - 1)
 +                              --bb_reg_state->mem_count;
 +              }
 +      }
 +}
 +
 +/* Set memory location *('dst' + 'offset_address') to contain the supplied
 + * value and offset.  'dst' is assumed to be a register that contains a stack
 + * pointer.
 + */
 +
 +static void
 +bb_memory_set_reg_value(enum bb_reg_code dst, short offset_address,
 +                      enum bb_reg_code value, short offset_value)
 +{
 +      int i;
 +      struct bb_memory_contains *c, *free = NULL;
 +      BB_CHECK(!bb_is_osp_defined(dst), dst, );
 +      KDB_DEBUG_BB("  *(%s", bbrg_name[dst]);
 +      KDB_DEBUG_BB_OFFSET(offset_address, "", "");
 +      offset_address += bb_reg_code_offset(dst);
 +      KDB_DEBUG_BB_OFFSET(offset_address, " osp", ") = ");
 +      KDB_DEBUG_BB("%s", bbrg_name[value]);
 +      if (value == BBRG_OSP)
 +              KDB_DEBUG_BB_OFFSET(offset_value, "", "");
 +      for (i = 0, c = bb_reg_state->memory;
 +           i < bb_reg_state_max;
 +           ++i, ++c) {
 +              if (c->offset_address == offset_address)
 +                      free = c;
 +              else if (c->value == BBRG_UNDEFINED && !free)
 +                      free = c;
 +      }
 +      if (!free) {
 +              struct bb_reg_state *new, *old = bb_reg_state;
 +              size_t old_size, new_size;
 +              int slot;
 +              old_size = sizeof(*old) + bb_reg_state_max *
 +                                sizeof(old->memory[0]);
 +              slot = bb_reg_state_max;
 +              bb_reg_state_max += 5;
 +              new_size = sizeof(*new) + bb_reg_state_max *
 +                                sizeof(new->memory[0]);
 +              new = debug_kmalloc(new_size, GFP_ATOMIC);
 +              if (!new) {
 +                      kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
 +                      bb_giveup = 1;
 +              } else {
 +                      memcpy(new, old, old_size);
 +                      memset((char *)new + old_size, BBRG_UNDEFINED,
 +                             new_size - old_size);
 +                      bb_reg_state = new;
 +                      debug_kfree(old);
 +                      free = bb_reg_state->memory + slot;
 +              }
 +      }
 +      if (free) {
 +              int slot = free - bb_reg_state->memory;
 +              free->offset_address = offset_address;
 +              free->value = value;
 +              free->offset_value = offset_value;
 +              KDB_DEBUG_BB(" slot %d", slot);
 +              bb_reg_state->mem_count = max(bb_reg_state->mem_count, slot+1);
 +      }
 +      KDB_DEBUG_BB("\n");
 +}
 +
 +/* Set memory location *('dst' + 'offset') to contain the value from register
 + * 'src'.  'dst' is assumed to be a register that contains a stack pointer.
 + * This differs from bb_memory_set_reg_value because it takes a src register
 + * which contains a value and possibly an offset, whereas bb_memory_set_reg_value
 + * is passed the value and offset directly.
 + */
 +
 +static void
 +bb_memory_set_reg(enum bb_reg_code dst, enum bb_reg_code src,
 +                short offset_address)
 +{
 +      int offset_value;
 +      enum bb_reg_code value;
 +      BB_CHECK(!bb_is_osp_defined(dst), dst, );
 +      if (!bb_is_int_reg(src))
 +              return;
 +      value = bb_reg_code_value(src);
 +      if (value == BBRG_UNDEFINED) {
 +              bb_delete_memory(offset_address + bb_reg_code_offset(dst));
 +              return;
 +      }
 +      offset_value = bb_reg_code_offset(src);
 +      bb_reg_read(src);
 +      bb_memory_set_reg_value(dst, offset_address, value, offset_value);
 +}
 +
 +/* Set register 'dst' to contain the value from memory *('src' + offset_address).
 + * 'src' is assumed to be a register that contains a stack pointer.
 + */
 +
 +static void
 +bb_reg_set_memory(enum bb_reg_code dst, enum bb_reg_code src, short offset_address)
 +{
 +      int i, defined = 0;
 +      struct bb_memory_contains *s;
 +      BB_CHECK(!bb_is_osp_defined(src), src, );
 +      KDB_DEBUG_BB("  %s = *(%s",
 +                   bbrg_name[dst], bbrg_name[src]);
 +      KDB_DEBUG_BB_OFFSET(offset_address, "", ")");
 +      offset_address += bb_reg_code_offset(src);
 +      KDB_DEBUG_BB_OFFSET(offset_address, " (osp", ")");
 +      for (i = 0, s = bb_reg_state->memory;
 +           i < bb_reg_state->mem_count;
 +           ++i, ++s) {
 +              if (s->offset_address == offset_address && bb_is_int_reg(dst)) {
 +                      bb_reg_code_set_value(dst, s->value);
 +                      KDB_DEBUG_BB(" value %s", bbrg_name[s->value]);
 +                      if (s->value == BBRG_OSP) {
 +                              bb_reg_code_set_offset(dst, s->offset_value);
 +                              KDB_DEBUG_BB_OFFSET(s->offset_value, "", "");
 +                      } else {
 +                              bb_reg_code_set_offset(dst, 0);
 +                      }
 +                      defined = 1;
 +              }
 +      }
 +      if (!defined)
 +              bb_reg_set_reg(dst, BBRG_UNDEFINED);
 +      else
 +              KDB_DEBUG_BB("\n");
 +}
 +
 +/* A generic read from an operand. */
 +
 +static void
 +bb_read_operand(const struct bb_operand *operand)
 +{
 +      int m = 0;
 +      if (operand->base_rc)
 +              bb_reg_read(operand->base_rc);
 +      if (operand->index_rc)
 +              bb_reg_read(operand->index_rc);
 +      if (bb_is_simple_memory(operand) &&
 +          bb_is_osp_defined(operand->base_rc) &&
 +          bb_decode.match->usage != BBOU_LEA) {
 +              m = (bb_reg_code_offset(operand->base_rc) + operand->disp +
 +                   KDB_WORD_SIZE - 1) / KDB_WORD_SIZE;
 +              bb_memory_params = max(bb_memory_params, m);
 +      }
 +}
 +
 +/* A generic write to an operand, resulting in an undefined value in that
 + * location.  All well defined operands are handled separately, this function
 + * only handles the opcodes where the result is undefined.
 + */
 +
 +static void
 +bb_write_operand(const struct bb_operand *operand)
 +{
 +      enum bb_reg_code base_rc = operand->base_rc;
 +      if (operand->memory) {
 +              if (base_rc)
 +                      bb_reg_read(base_rc);
 +              if (operand->index_rc)
 +                      bb_reg_read(operand->index_rc);
 +      } else if (operand->reg && base_rc) {
 +              bb_reg_set_undef(base_rc);
 +      }
 +      if (bb_is_simple_memory(operand) && bb_is_osp_defined(base_rc)) {
 +              int offset;
 +              offset = bb_reg_code_offset(base_rc) + operand->disp;
 +              offset = ALIGN(offset - KDB_WORD_SIZE + 1, KDB_WORD_SIZE);
 +              bb_delete_memory(offset);
 +      }
 +}
 +
 +/* Adjust a register that contains a stack pointer */
 +
 +static void
 +bb_adjust_osp(enum bb_reg_code reg, int adjust)
 +{
 +      int offset = bb_reg_code_offset(reg), old_offset = offset;
 +      KDB_DEBUG_BB("  %s osp offset ", bbrg_name[reg]);
 +      KDB_DEBUG_BB_OFFSET(bb_reg_code_offset(reg), "", " -> ");
 +      offset += adjust;
 +      bb_reg_code_set_offset(reg, offset);
 +      KDB_DEBUG_BB_OFFSET(bb_reg_code_offset(reg), "", "\n");
 +      /* When RSP is adjusted upwards, it invalidates any memory
 +       * stored between the old and current stack offsets.
 +       */
 +      if (reg == BBRG_RSP) {
 +              while (old_offset < bb_reg_code_offset(reg)) {
 +                      bb_delete_memory(old_offset);
 +                      old_offset += KDB_WORD_SIZE;
 +              }
 +      }
 +}
 +
 +/* The current instruction adjusts a register that contains a stack pointer.
 + * Direction is 1 or -1, depending on whether the instruction is add/lea or
 + * sub.
 + */
 +
 +static void
 +bb_adjust_osp_instruction(int direction)
 +{
 +      enum bb_reg_code dst_reg = bb_decode.dst.base_rc;
 +      if (bb_decode.src.immediate ||
 +          bb_decode.match->usage == BBOU_LEA /* lea has its own checks */) {
 +              int adjust = direction * bb_decode.src.disp;
 +              bb_adjust_osp(dst_reg, adjust);
 +      } else {
 +              /* variable stack adjustment, osp offset is not well defined */
 +              KDB_DEBUG_BB("  %s osp offset ", bbrg_name[dst_reg]);
 +              KDB_DEBUG_BB_OFFSET(bb_reg_code_offset(dst_reg), "", " -> undefined\n");
 +              bb_reg_code_set_value(dst_reg, BBRG_UNDEFINED);
 +              bb_reg_code_set_offset(dst_reg, 0);
 +      }
 +}
 +
 +/* Some instructions using memory have an explicit length suffix (b, w, l, q).
 + * The equivalent instructions using a register imply the length from the
 + * register name.  Deduce the operand length.
 + */
 +
 +static int
 +bb_operand_length(const struct bb_operand *operand, char opcode_suffix)
 +{
 +      int l = 0;
 +      switch (opcode_suffix) {
 +      case 'b':
 +              l = 8;
 +              break;
 +      case 'w':
 +              l = 16;
 +              break;
 +      case 'l':
 +              l = 32;
 +              break;
 +      case 'q':
 +              l = 64;
 +              break;
 +      }
 +      if (l == 0 && operand->reg) {
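 +              /* No explicit suffix; deduce the length from the register
 +               * name, e.g. "%al"/"%ah" are 8 bit, "%ax" is 16 bit, "%eax"
 +               * is 32 bit and "%rax" is 64 bit.
 +               */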
 +              switch (strlen(operand->base)) {
 +              case 3:
 +                      switch (operand->base[2]) {
 +                      case 'h':
 +                      case 'l':
 +                              l = 8;
 +                              break;
 +                      default:
 +                              l = 16;
 +                              break;
 +                      }
 +                      break;
 +              case 4:
 +                      if (operand->base[1] == 'r')
 +                              l = 64;
 +                      else
 +                              l = 32;
 +                      break;
 +              }
 +      }
 +      return l;
 +}
 +
 +static int
 +bb_reg_state_size(const struct bb_reg_state *state)
 +{
 +      return sizeof(*state) +
 +             state->mem_count * sizeof(state->memory[0]);
 +}
 +
 +/* Canonicalize the current bb_reg_state so it can be compared against
 + * previously created states.  Sort the memory entries in descending order of
 + * offset_address (stack grows down).  Empty slots are moved to the end of the
 + * list and trimmed.
 + */
 +
 +static void
 +bb_reg_state_canonicalize(void)
 +{
 +      int i, order, changed;
 +      struct bb_memory_contains *p1, *p2, temp;
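 +      /* Bubble sort the memory slots: defined entries before undefined
 +       * ones, in descending order of offset_address.
 +       */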
 +      do {
 +              changed = 0;
 +              for (i = 0, p1 = bb_reg_state->memory;
 +                   i < bb_reg_state->mem_count-1;
 +                   ++i, ++p1) {
 +                      p2 = p1 + 1;
 +                      if (p2->value == BBRG_UNDEFINED) {
 +                              order = 0;
 +                      } else if (p1->value == BBRG_UNDEFINED) {
 +                              order = 1;
 +                      } else if (p1->offset_address < p2->offset_address) {
 +                              order = 1;
 +                      } else if (p1->offset_address > p2->offset_address) {
 +                              order = -1;
 +                      } else {
 +                              order = 0;
 +                      }
 +                      if (order > 0) {
 +                              temp = *p2;
 +                              *p2 = *p1;
 +                              *p1 = temp;
 +                              changed = 1;
 +                      }
 +              }
 +      } while (changed);
 +      for (i = 0, p1 = bb_reg_state->memory;
 +           i < bb_reg_state_max;
 +           ++i, ++p1) {
 +              if (p1->value != BBRG_UNDEFINED)
 +                      bb_reg_state->mem_count = i + 1;
 +      }
 +      bb_reg_state_print(bb_reg_state);
 +}
 +
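 +/* Check if a transfer target is one of the known special case labels (mainly
 + * in entry.S).  If it is, verify that the current register and stack state
 + * matches what that label expects and report any differences.  Returns 1 if
 + * the transfer was handled as a special case, 0 otherwise.
 + */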
 +static int
 +bb_special_case(bfd_vma to)
 +{
 +      int i, j, rsp_offset, expect_offset, offset, errors = 0, max_errors = 40;
 +      enum bb_reg_code reg, expect_value, value;
 +      struct bb_name_state *r;
 +
 +      for (i = 0, r = bb_special_cases;
 +           i < ARRAY_SIZE(bb_special_cases);
 +           ++i, ++r) {
 +              if (to == r->address &&
 +                  (r->fname == NULL || strcmp(bb_func_name, r->fname) == 0))
 +                      goto match;
 +      }
 +      /* Some inline assembler code has jumps to .fixup sections which result
 +       * in out of line transfers with undefined state; ignore them.
 +       */
 +      if (strcmp(bb_func_name, "strnlen_user") == 0 ||
 +          strcmp(bb_func_name, "copy_from_user") == 0)
 +              return 1;
 +      return 0;
 +
 +match:
 +      /* Check the running registers match */
 +      for (reg = BBRG_RAX; reg < r->regs_size; ++reg) {
 +              expect_value = r->regs[reg].value;
 +              if (test_bit(expect_value, r->skip_regs.bits)) {
 +                      /* this regs entry is not defined for this label */
 +                      continue;
 +              }
 +              if (expect_value == BBRG_UNDEFINED)
 +                      continue;
 +              expect_offset = r->regs[reg].offset;
 +              value = bb_reg_code_value(reg);
 +              offset = bb_reg_code_offset(reg);
 +              if (expect_value == value &&
 +                  (value != BBRG_OSP || r->osp_offset == offset))
 +                      continue;
 +              kdb_printf("%s: Expected %s to contain %s",
 +                         __FUNCTION__,
 +                         bbrg_name[reg],
 +                         bbrg_name[expect_value]);
 +              if (r->osp_offset)
 +                      KDB_DEBUG_BB_OFFSET_PRINTF(r->osp_offset, "", "");
 +              kdb_printf(".  It actually contains %s", bbrg_name[value]);
 +              if (offset)
 +                      KDB_DEBUG_BB_OFFSET_PRINTF(offset, "", "");
 +              kdb_printf("\n");
 +              ++errors;
 +              if (max_errors-- == 0)
 +                      goto fail;
 +      }
 +      /* Check that any memory data on stack matches */
 +      i = j = 0;
 +      while (i < bb_reg_state->mem_count &&
 +             j < r->mem_size) {
 +              expect_value = r->mem[j].value;
 +              if (test_bit(expect_value, r->skip_mem.bits) ||
 +                  expect_value == BBRG_UNDEFINED) {
 +                      /* this memory slot is not defined for this label */
 +                      ++j;
 +                      continue;
 +              }
 +              rsp_offset = bb_reg_state->memory[i].offset_address -
 +                      bb_reg_code_offset(BBRG_RSP);
 +              if (rsp_offset >
 +                  r->mem[j].offset_address) {
 +                      /* extra slots in memory are OK */
 +                      ++i;
 +              } else if (rsp_offset <
 +                         r->mem[j].offset_address) {
 +                      /* Required memory slot is missing */
 +                      kdb_printf("%s: Invalid bb_reg_state.memory, "
 +                                 "missing memory entry[%d] %s\n",
 +                         __FUNCTION__, j, bbrg_name[expect_value]);
 +                      ++errors;
 +                      if (max_errors-- == 0)
 +                              goto fail;
 +                      ++j;
 +              } else {
 +                      if (bb_reg_state->memory[i].offset_value ||
 +                          bb_reg_state->memory[i].value != expect_value) {
 +                              /* memory slot is present but contains wrong
 +                               * value.
 +                               */
 +                              kdb_printf("%s: Invalid bb_reg_state.memory, "
 +                                          "wrong value in slot %d, "
 +                                          "should be %s, it is %s\n",
 +                                 __FUNCTION__, i,
 +                                 bbrg_name[expect_value],
 +                                 bbrg_name[bb_reg_state->memory[i].value]);
 +                              ++errors;
 +                              if (max_errors-- == 0)
 +                                      goto fail;
 +                      }
 +                      ++i;
 +                      ++j;
 +              }
 +      }
 +      while (j < r->mem_size) {
 +              expect_value = r->mem[j].value;
 +              if (test_bit(expect_value, r->skip_mem.bits) ||
 +                  expect_value == BBRG_UNDEFINED)
 +                      ++j;
 +              else
 +                      break;
 +      }
 +      if (j != r->mem_size) {
 +              /* Hit end of memory before testing all the pt_reg slots */
 +              kdb_printf("%s: Invalid bb_reg_state.memory, "
 +                          "missing trailing entries\n",
 +                 __FUNCTION__);
 +              ++errors;
 +              if (max_errors-- == 0)
 +                      goto fail;
 +      }
 +      if (errors)
 +              goto fail;
 +      return 1;
 +fail:
 +      kdb_printf("%s: on transfer to %s\n", __FUNCTION__, r->name);
 +      bb_giveup = 1;
 +      return 1;
 +}
 +
 +/* Transfer of control to a label outside the current function.  If the
 + * transfer is to a known common code path then do a sanity check on the state
 + * at this point.
 + */
 +
 +static void
 +bb_sanity_check(int type)
 +{
 +      enum bb_reg_code expect, actual;
 +      int i, offset, error = 0;
 +
 +      for (i = 0; i < ARRAY_SIZE(bb_preserved_reg); ++i) {
 +              expect = bb_preserved_reg[i];
 +              actual = bb_reg_code_value(expect);
 +              offset = bb_reg_code_offset(expect);
 +              if (expect == actual)
 +                      continue;
 +              /* type == 1 is sysret/sysexit, ignore RSP */
 +              if (type && expect == BBRG_RSP)
 +                      continue;
 +              /* type == 1 is sysret/sysexit, ignore RBP for i386 */
 +              /* We used to have "#ifndef CONFIG_X86_64" for the type=1 RBP
 +               * test; however, x86_64 can run ia32 compatible mode and
 +               * hit this problem. Perform the following test anyway!
 +               */
 +              if (type && expect == BBRG_RBP)
 +                      continue;
 +              /* RSP should contain OSP+0.  Except for ptregscall_common and
 +               * ia32_ptregs_common, they get a partial pt_regs, fudge the
 +               * stack to make it a full pt_regs then reverse the effect on
 +               * exit, so the offset is -0x50 on exit.
 +               */
 +              if (expect == BBRG_RSP &&
 +                  bb_is_osp_defined(expect) &&
 +                  (offset == 0 ||
 +                   (offset == -0x50 &&
 +                    (strcmp(bb_func_name, "ptregscall_common") == 0 ||
 +                     strcmp(bb_func_name, "ia32_ptregs_common") == 0))))
 +                      continue;
 +              /* The put_user and save_paranoid functions are special.
 +               * %rbx gets clobbered */
 +              if (expect == BBRG_RBX &&
 +                      (strncmp(bb_func_name, "__put_user_", 11) == 0 ||
 +                       strcmp(bb_func_name, "save_paranoid") == 0))
 +                      continue;
 +              /* Ignore rbx and rsp for error_entry */
 +              if ((strcmp(bb_func_name, "error_entry") == 0) &&
 +                  (expect == BBRG_RBX ||
 +                   (expect == BBRG_RSP && bb_is_osp_defined(expect) && offset == -0x10)))
 +                      continue;
 +              kdb_printf("%s: Expected %s, got %s",
 +                         __FUNCTION__,
 +                         bbrg_name[expect], bbrg_name[actual]);
 +              if (offset)
 +                      KDB_DEBUG_BB_OFFSET_PRINTF(offset, "", "");
 +              kdb_printf("\n");
 +              error = 1;
 +      }
 +      BB_CHECK(error, error, );
 +}
 +
 +/* Transfer of control.  Follow the arc and save the current state as input to
 + * another basic block.
 + */
 +
 +static void
 +bb_transfer(bfd_vma from, bfd_vma to, unsigned int drop_through)
 +{
 +      int i, found;
 +      size_t size;
 +      struct bb* bb = NULL;   /*stupid gcc */
 +      struct bb_jmp *bb_jmp;
 +      struct bb_reg_state *state;
 +      bb_reg_state_canonicalize();
 +      found = 0;
 +      for (i = 0; i < bb_jmp_count; ++i) {
 +              bb_jmp = bb_jmp_list + i;
 +              if (bb_jmp->from == from &&
 +                  bb_jmp->to == to &&
 +                  bb_jmp->drop_through == drop_through) {
 +                      found = 1;
 +                      break;
 +              }
 +      }
 +      if (!found) {
 +              /* Transfer outside the current function.  Check the special
 +               * cases (mainly in entry.S) first.  If it is not a known
 +               * special case then check if the target address is the start
 +               * of a function or not.  If it is the start of a function then
 +               * assume tail recursion and require that the state be the same
 +               * as on entry.  Otherwise assume out of line code (e.g.
 +               * spinlock contention path) and ignore it, the state can be
 +               * anything.
 +               */
 +              kdb_symtab_t symtab;
 +              if (bb_special_case(to))
 +                      return;
 +              kdbnearsym(to, &symtab);
 +              if (symtab.sym_start != to)
 +                      return;
 +              bb_sanity_check(0);
 +              if (bb_giveup)
 +                      return;
 +#ifdef        NO_SIBLINGS
 +              /* Only print this message when the kernel is compiled with
 +               * -fno-optimize-sibling-calls.  Otherwise it would print a
 +               * message for every tail recursion call.  If you see the
 +               * message below then you probably have an assembler label that
 +               * is not listed in the special cases.
 +               */
 +              kdb_printf("  not matched: from "
 +                         kdb_bfd_vma_fmt0
 +                         " to " kdb_bfd_vma_fmt0
 +                         " drop_through %d bb_jmp[%d]\n",
 +                         from, to, drop_through, i);
 +#endif        /* NO_SIBLINGS */
 +              return;
 +      }
 +      KDB_DEBUG_BB("  matched: from " kdb_bfd_vma_fmt0
 +                   " to " kdb_bfd_vma_fmt0
 +                   " drop_through %d bb_jmp[%d]\n",
 +                   from, to, drop_through, i);
 +      found = 0;
 +      for (i = 0; i < bb_count; ++i) {
 +              bb = bb_list[i];
 +              if (bb->start == to) {
 +                      found = 1;
 +                      break;
 +              }
 +      }
 +      BB_CHECK(!found, to, );
 +      /* If the register state for this arc has already been set (we are
 +       * rescanning the block that originates the arc) and the state is the
 +       * same as the previous state for this arc then this input to the
 +       * target block is the same as last time, so there is no need to rescan
 +       * the target block.
 +       */
 +      state = bb_jmp->state;
 +      size = bb_reg_state_size(bb_reg_state);
 +      if (state) {
 +              bb_reg_state->ref_count = state->ref_count;
 +              if (memcmp(state, bb_reg_state, size) == 0) {
 +                      KDB_DEBUG_BB("  no state change\n");
 +                      return;
 +              }
 +              if (--state->ref_count == 0)
 +                      debug_kfree(state);
 +              bb_jmp->state = NULL;
 +      }
 +      /* New input state is required.  To save space, check if any other arcs
 +       * have the same state and reuse them where possible.  The overall set
 +       * of inputs to the target block is now different so the target block
 +       * must be rescanned.
 +       */
 +      bb->changed = 1;
 +      for (i = 0; i < bb_jmp_count; ++i) {
 +              state = bb_jmp_list[i].state;
 +              if (!state)
 +                      continue;
 +              bb_reg_state->ref_count = state->ref_count;
 +              if (memcmp(state, bb_reg_state, size) == 0) {
 +                      KDB_DEBUG_BB("  reuse bb_jmp[%d]\n", i);
 +                      bb_jmp->state = state;
 +                      ++state->ref_count;
 +                      return;
 +              }
 +      }
 +      state = debug_kmalloc(size, GFP_ATOMIC);
 +      if (!state) {
 +              kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
 +              bb_giveup = 1;
 +              return;
 +      }
 +      memcpy(state, bb_reg_state, size);
 +      state->ref_count = 1;
 +      bb_jmp->state = state;
 +      KDB_DEBUG_BB("  new state %p\n", state);
 +}
 +
 +/* Isolate the processing for 'mov' so it can be used for 'xadd'/'xchg' as
 + * well.
 + *
 + * xadd/xchg expect this function to return BBOU_NOP for special cases,
 + * otherwise it returns BBOU_RSWD.  All special cases must be handled entirely
 + * within this function, including doing bb_read_operand or bb_write_operand
 + * where necessary.
 + */
 +
 +static enum bb_operand_usage
 +bb_usage_mov(const struct bb_operand *src, const struct bb_operand *dst, int l)
 +{
 +      int full_register_src, full_register_dst;
 +      full_register_src = bb_operand_length(src, bb_decode.opcode[l])
 +                          == KDB_WORD_SIZE * 8;
 +      full_register_dst = bb_operand_length(dst, bb_decode.opcode[l])
 +                          == KDB_WORD_SIZE * 8;
 +      /* If both src and dst are full integer registers then record the
 +       * register change.
 +       */
 +      if (src->reg &&
 +          bb_is_int_reg(src->base_rc) &&
 +          dst->reg &&
 +          bb_is_int_reg(dst->base_rc) &&
 +          full_register_src &&
 +          full_register_dst) {
 +              /* Special case for the code that switches stacks in
 +               * jprobe_return.  That code must modify RSP but it does it in
 +               * a well defined manner.  Do not invalidate RSP.
 +               */
 +              if (src->base_rc == BBRG_RBX &&
 +                  dst->base_rc == BBRG_RSP &&
 +                  strcmp(bb_func_name, "jprobe_return") == 0) {
 +                      bb_read_operand(src);
 +                      return BBOU_NOP;
 +              }
 +              /* math_abort takes the equivalent of a longjmp structure and
 +               * resets the stack.  Ignore this; it leaves RSP well defined.
 +               */
 +              if (dst->base_rc == BBRG_RSP &&
 +                  strcmp(bb_func_name, "math_abort") == 0) {
 +                      bb_read_operand(src);
 +                      return BBOU_NOP;
 +              }
 +              bb_reg_set_reg(dst->base_rc, src->base_rc);
 +              return BBOU_NOP;
 +      }
 +      /* If the move is from a full integer register to stack then record it.
 +       */
 +      if (src->reg &&
 +          bb_is_simple_memory(dst) &&
 +          bb_is_osp_defined(dst->base_rc) &&
 +          full_register_src) {
 +              /* Ugly special case.  Initializing list heads on stack causes
 +               * false references to stack variables when the list head is
 +               * used.  Static code analysis cannot detect that the list head
 +               * has been changed by a previous execution loop and that a
 +               * basic block is only executed after the list head has been
 +               * changed.
 +               *
 +               * These false references can result in valid stack variables
 +               * being incorrectly cleared on some logic paths.  Ignore
 +               * stores to stack variables which point to themselves or to
 +               * the previous word so the list head initialization is not
 +               * recorded.
 +               */
 +              if (bb_is_osp_defined(src->base_rc)) {
 +                      int stack1 = bb_reg_code_offset(src->base_rc);
 +                      int stack2 = bb_reg_code_offset(dst->base_rc) +
 +                                   dst->disp;
 +                      if (stack1 == stack2 ||
 +                          stack1 == stack2 - KDB_WORD_SIZE)
 +                              return BBOU_NOP;
 +              }
 +              bb_memory_set_reg(dst->base_rc, src->base_rc, dst->disp);
 +              return BBOU_NOP;
 +      }
 +      /* If the move is from stack to a full integer register then record it.
 +       */
 +      if (bb_is_simple_memory(src) &&
 +          bb_is_osp_defined(src->base_rc) &&
 +          dst->reg &&
 +          bb_is_int_reg(dst->base_rc) &&
 +          full_register_dst) {
 +#ifdef        CONFIG_X86_32
- #ifndef TSS_sysenter_sp0
- #define TSS_sysenter_sp0 SYSENTER_stack_sp0
- #endif
 +              /* A mov from TSS_sysenter_sp0+offset to esp fixes up the
 +               * sysenter stack and leaves esp well defined.  mov
 +               * TSS_sysenter_sp0+offset(%esp),%esp is followed by up to 5
 +               * push instructions to mimic the hardware stack push.  If
 +               * TSS_sysenter_sp0 is accessed with an extra offset then only
 +               * 3 words will be pushed.
 +               */
 +              if (dst->base_rc == BBRG_RSP &&
 +                  src->disp >= TSS_sysenter_sp0 &&
 +                  bb_is_osp_defined(BBRG_RSP)) {
 +                      int pushes;
 +                      pushes = src->disp == TSS_sysenter_sp0 ? 5 : 3;
 +                      bb_reg_code_set_offset(BBRG_RSP,
 +                              bb_reg_code_offset(BBRG_RSP) +
 +                                      pushes * KDB_WORD_SIZE);
 +                      KDB_DEBUG_BB_OFFSET(
 +                              bb_reg_code_offset(BBRG_RSP),
 +                              "  sysenter fixup, RSP",
 +                             "\n");
 +                      return BBOU_NOP;
 +              }
 +#endif        /* CONFIG_X86_32 */
 +              bb_read_operand(src);
 +              bb_reg_set_memory(dst->base_rc, src->base_rc, src->disp);
 +              return BBOU_NOP;
 +      }
 +      /* mov %gs:0x<nn>,%rsp is used to unconditionally switch to another
 +       * stack.  Ignore this special case; it is handled by the stack
 +       * unwinding code.
 +       */
 +      if (src->segment &&
 +          strcmp(src->segment, "%gs") == 0 &&
 +          dst->reg &&
 +          dst->base_rc == BBRG_RSP)
 +              return BBOU_NOP;
 +      /* move %reg,%reg is a nop */
 +      if (src->reg &&
 +          dst->reg &&
 +          !src->segment &&
 +          !dst->segment &&
 +          strcmp(src->base, dst->base) == 0)
 +              return BBOU_NOP;
 +      /* Special case for the code that switches stacks in the scheduler
 +       * (switch_to()).  That code must modify RSP but it does it in a well
 +       * defined manner.  Do not invalidate RSP.
 +       */
 +      if (dst->reg &&
 +          dst->base_rc == BBRG_RSP &&
 +          full_register_dst &&
 +          bb_is_scheduler_address()) {
 +              bb_read_operand(src);
 +              return BBOU_NOP;
 +      }
 +      /* Special case for the code that switches stacks in resume from
 +       * hibernation code.  That code must modify RSP but it does it in a
 +       * well defined manner.  Do not invalidate RSP.
 +       */
 +      if (src->memory &&
 +          dst->reg &&
 +          dst->base_rc == BBRG_RSP &&
 +          full_register_dst &&
 +          strcmp(bb_func_name, "restore_image") == 0) {
 +              bb_read_operand(src);
 +              return BBOU_NOP;
 +      }
 +      return BBOU_RSWD;
 +}
 +
 +static enum bb_operand_usage
 +bb_usage_xadd(const struct bb_operand *src, const struct bb_operand *dst)
 +{
 +      /* Simulate xadd as a series of instructions including mov, that way we
 +       * get the benefit of all the special cases already handled by
 +       * BBOU_MOV.
 +       *
 +       * tmp = src + dst, src = dst, dst = tmp.
 +       *
 +       * For tmp, pick a register that is undefined.  If all registers are
 +       * defined then pick one that is not being used by xadd.
 +       */
 +      enum bb_reg_code reg = BBRG_UNDEFINED;
 +      struct bb_operand tmp;
 +      struct bb_reg_contains save_tmp;
 +      enum bb_operand_usage usage;
 +      int undefined = 0;
 +      for (reg = BBRG_RAX; reg < BBRG_RAX + KDB_INT_REGISTERS; ++reg) {
 +              if (bb_reg_code_value(reg) == BBRG_UNDEFINED) {
 +                      undefined = 1;
 +                      break;
 +              }
 +      }
 +      if (!undefined) {
 +              for (reg = BBRG_RAX; reg < BBRG_RAX + KDB_INT_REGISTERS; ++reg) {
 +                      if (reg != src->base_rc &&
 +                          reg != src->index_rc &&
 +                          reg != dst->base_rc &&
 +                          reg != dst->index_rc &&
 +                          reg != BBRG_RSP)
 +                              break;
 +              }
 +      }
 +      KDB_DEBUG_BB("  %s saving tmp %s\n", __FUNCTION__, bbrg_name[reg]);
 +      save_tmp = bb_reg_state->contains[reg - BBRG_RAX];
 +      bb_reg_set_undef(reg);
 +      memset(&tmp, 0, sizeof(tmp));
 +      tmp.present = 1;
 +      tmp.reg = 1;
 +      tmp.base = debug_kmalloc(strlen(bbrg_name[reg]) + 2, GFP_ATOMIC);
 +      if (tmp.base) {
 +              tmp.base[0] = '%';
 +              strcpy(tmp.base + 1, bbrg_name[reg]);
 +      }
 +      tmp.base_rc = reg;
 +      bb_read_operand(src);
 +      bb_read_operand(dst);
 +      if (bb_usage_mov(src, dst, sizeof("xadd")-1) == BBOU_NOP)
 +              usage = BBOU_RSRD;
 +      else
 +              usage = BBOU_RSRDWS;
 +      bb_usage_mov(&tmp, dst, sizeof("xadd")-1);
 +      KDB_DEBUG_BB("  %s restoring tmp %s\n", __FUNCTION__, bbrg_name[reg]);
 +      bb_reg_state->contains[reg - BBRG_RAX] = save_tmp;
 +      debug_kfree(tmp.base);
 +      return usage;
 +}
 +
 +static enum bb_operand_usage
 +bb_usage_xchg(const struct bb_operand *src, const struct bb_operand *dst)
 +{
 +      /* Simulate xchg as a series of mov instructions, that way we get the
 +       * benefit of all the special cases already handled by BBOU_MOV.
 +       *
 +       * mov dst,tmp; mov src,dst; mov tmp,src;
 +       *
 +       * For tmp, pick a register that is undefined.  If all registers are
 +       * defined then pick one that is not being used by xchg.
 +       */
 +      enum bb_reg_code reg = BBRG_UNDEFINED;
 +      int rs = BBOU_RS, rd = BBOU_RD, ws = BBOU_WS, wd = BBOU_WD;
 +      struct bb_operand tmp;
 +      struct bb_reg_contains save_tmp;
 +      int undefined = 0;
 +      for (reg = BBRG_RAX; reg < BBRG_RAX + KDB_INT_REGISTERS; ++reg) {
 +              if (bb_reg_code_value(reg) == BBRG_UNDEFINED) {
 +                      undefined = 1;
 +                      break;
 +              }
 +      }
 +      if (!undefined) {
 +              for (reg = BBRG_RAX; reg < BBRG_RAX + KDB_INT_REGISTERS; ++reg) {
 +                      if (reg != src->base_rc &&
 +                          reg != src->index_rc &&
 +                          reg != dst->base_rc &&
 +                          reg != dst->index_rc &&
 +                          reg != BBRG_RSP)
 +                              break;
 +              }
 +      }
 +      KDB_DEBUG_BB("  %s saving tmp %s\n", __FUNCTION__, bbrg_name[reg]);
 +      save_tmp = bb_reg_state->contains[reg - BBRG_RAX];
 +      memset(&tmp, 0, sizeof(tmp));
 +      tmp.present = 1;
 +      tmp.reg = 1;
 +      tmp.base = debug_kmalloc(strlen(bbrg_name[reg]) + 2, GFP_ATOMIC);
 +      if (tmp.base) {
 +              tmp.base[0] = '%';
 +              strcpy(tmp.base + 1, bbrg_name[reg]);
 +      }
 +      tmp.base_rc = reg;
 +      if (bb_usage_mov(dst, &tmp, sizeof("xchg")-1) == BBOU_NOP)
 +              rd = 0;
 +      if (bb_usage_mov(src, dst, sizeof("xchg")-1) == BBOU_NOP) {
 +              rs = 0;
 +              wd = 0;
 +      }
 +      if (bb_usage_mov(&tmp, src, sizeof("xchg")-1) == BBOU_NOP)
 +              ws = 0;
 +      KDB_DEBUG_BB("  %s restoring tmp %s\n", __FUNCTION__, bbrg_name[reg]);
 +      bb_reg_state->contains[reg - BBRG_RAX] = save_tmp;
 +      debug_kfree(tmp.base);
 +      return rs | rd | ws | wd;
 +}
 +
 +/* Invalidate all the scratch registers */
 +
 +static void
 +bb_invalidate_scratch_reg(void)
 +{
 +      int i, j;
 +      for (i = BBRG_RAX; i < BBRG_RAX + KDB_INT_REGISTERS; ++i) {
 +              for (j = 0; j < ARRAY_SIZE(bb_preserved_reg); ++j) {
 +                      if (i == bb_preserved_reg[j])
 +                              goto preserved;
 +              }
 +              bb_reg_set_undef(i);
 +preserved:
 +              continue;
 +      }
 +}
 +
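 +/* Follow a computed jmp through its jump table.  Read successive words from
 + * the table and record a transfer for each entry that lies within the current
 + * function; stop at the first entry that does not.
 + */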
 +static void
 +bb_pass2_computed_jmp(const struct bb_operand *src)
 +{
 +      unsigned long table = src->disp;
 +      kdb_machreg_t addr;
 +      while (!bb_giveup) {
 +              if (kdb_getword(&addr, table, sizeof(addr)))
 +                      return;
 +              if (addr < bb_func_start || addr >= bb_func_end)
 +                      return;
 +              bb_transfer(bb_curr_addr, addr, 0);
 +              table += KDB_WORD_SIZE;
 +      }
 +}
 +
 +/* The current instruction has been decoded and all the information is in
 + * bb_decode.  Based on the opcode, track any operand usage that we care about.
 + */
 +
 +static void
 +bb_usage(void)
 +{
 +      enum bb_operand_usage usage = bb_decode.match->usage;
 +      struct bb_operand *src = &bb_decode.src;
 +      struct bb_operand *dst = &bb_decode.dst;
 +      struct bb_operand *dst2 = &bb_decode.dst2;
 +      int opcode_suffix, operand_length;
 +
 +      /* First handle all the special usage cases, and map them to a generic
 +       * case after catering for the side effects.
 +       */
 +
 +      if (usage == BBOU_IMUL &&
 +          src->present && !dst->present && !dst2->present) {
 +              /* single operand imul, same effects as mul */
 +              usage = BBOU_MUL;
 +      }
 +
 +      /* AT&T syntax uses movs<l1><l2> for move with sign extension, instead
 +       * of the Intel movsx.  The AT&T syntax causes problems for the opcode
 +       * mapping; movs with sign extension needs to be treated as a generic
 +       * read src, write dst, but instead it falls under the movs I/O
 +       * instruction.  Fix it.
 +       */
 +      if (usage == BBOU_MOVS && strlen(bb_decode.opcode) > 5)
 +              usage = BBOU_RSWD;
 +
 +      /* This switch statement deliberately does not use 'default' at the top
 +       * level.  That way the compiler will complain if a new BBOU_ enum is
 +       * added above and not explicitly handled here.
 +       */
 +      switch (usage) {
 +      case BBOU_UNKNOWN:      /* drop through */
 +      case BBOU_RS:           /* drop through */
 +      case BBOU_RD:           /* drop through */
 +      case BBOU_RSRD:         /* drop through */
 +      case BBOU_WS:           /* drop through */
 +      case BBOU_RSWS:         /* drop through */
 +      case BBOU_RDWS:         /* drop through */
 +      case BBOU_RSRDWS:       /* drop through */
 +      case BBOU_WD:           /* drop through */
 +      case BBOU_RSWD:         /* drop through */
 +      case BBOU_RDWD:         /* drop through */
 +      case BBOU_RSRDWD:       /* drop through */
 +      case BBOU_WSWD:         /* drop through */
 +      case BBOU_RSWSWD:       /* drop through */
 +      case BBOU_RDWSWD:       /* drop through */
 +      case BBOU_RSRDWSWD:
 +              break;          /* ignore generic usage for now */
 +      case BBOU_ADD:
 +              /* Special case for add instructions that adjust registers
 +               * which are mapping the stack.
 +               */
 +              if (dst->reg && bb_is_osp_defined(dst->base_rc)) {
 +                      bb_adjust_osp_instruction(1);
 +                      usage = BBOU_RS;
 +              } else {
 +                      usage = BBOU_RSRDWD;
 +              }
 +              break;
 +      case BBOU_AND:
 +              /* Special case when trying to round the stack pointer
 +               * to achieve byte alignment
 +               */
 +              if (dst->reg && dst->base_rc == BBRG_RSP &&
 +                      src->immediate && strncmp(bb_func_name, "efi_call", 8) == 0) {
 +                              usage = BBOU_NOP;
 +              } else {
 +                      usage = BBOU_RSRDWD;
 +              }
 +              break;
 +      case BBOU_CALL:
 +              bb_reg_state_print(bb_reg_state);
 +              usage = BBOU_NOP;
 +              if (bb_is_static_disp(src)) {
 +                      /* save_args is special.  It saves
 +                       * a partial pt_regs onto the stack and switches
 +                       * to the interrupt stack.
 +                       */
 +                      if (src->disp == bb_save_args) {
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RDI, 0x48);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RSI, 0x40);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RDX, 0x38);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RCX, 0x30);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RAX, 0x28);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R8,  0x20);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R9,  0x18);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R10, 0x10);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R11, 0x08);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RBP, 0);
 +                              /* This is actually on the interrupt stack,
 +                               * but we fudge it so the unwind works.
 +                               */
 +                              bb_memory_set_reg_value(BBRG_RSP, -0x8, BBRG_RBP, 0);
 +                              bb_reg_set_reg(BBRG_RBP, BBRG_RSP);
 +                              bb_adjust_osp(BBRG_RSP, -KDB_WORD_SIZE);
 +                      }
 +                      /* save_rest juggles the stack frame to append the
 +                       * rest of the pt_regs onto a stack where SAVE_ARGS
 +                       * or save_args has already been done.
 +                       */
 +                      else if (src->disp == bb_save_rest) {
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RBX, 0x30);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RBP, 0x28);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R12, 0x20);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R13, 0x18);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R14, 0x10);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R15, 0x08);
 +                      }
 +                      /* error_entry and save_paranoid save a full pt_regs.
 +                       * Break out so the scratch registers aren't invalidated.
 +                       */
 +                      else if (src->disp == bb_error_entry || src->disp == bb_save_paranoid) {
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RDI, 0x70);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RSI, 0x68);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RDX, 0x60);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RCX, 0x58);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RAX, 0x50);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R8,  0x48);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R9,  0x40);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R10, 0x38);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R11, 0x30);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RBX, 0x28);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_RBP, 0x20);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R12, 0x18);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R13, 0x10);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R14, 0x08);
 +                              bb_memory_set_reg(BBRG_RSP, BBRG_R15, 0);
 +                              break;
 +                      }
 +              }
 +              /* Invalidate the scratch registers */
 +              bb_invalidate_scratch_reg();
 +
 +              /* These special cases need scratch registers invalidated first */
 +              if (bb_is_static_disp(src)) {
 +                      /* The functions sync_regs and save_v86_state are special.
 +                       * Their return value is the new stack pointer.
 +                       */
 +                      if (src->disp == bb_sync_regs) {
 +                              bb_reg_set_reg(BBRG_RAX, BBRG_RSP);
 +                      } else if (src->disp == bb_save_v86_state) {
 +                              bb_reg_set_reg(BBRG_RAX, BBRG_RSP);
 +                              bb_adjust_osp(BBRG_RAX, +KDB_WORD_SIZE);
 +                      }
 +              }
 +              break;
 +      case BBOU_CBW:
 +              /* Convert word in RAX.  Read RAX, write RAX */
 +              bb_reg_read(BBRG_RAX);
 +              bb_reg_set_undef(BBRG_RAX);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_CMOV:
 +              /* cmove %gs:0x<nn>,%rsp is used to conditionally switch to
 +               * another stack.  Ignore this special case, it is handled by
 +               * the stack unwinding code.
 +               */
 +              if (src->segment &&
 +                  strcmp(src->segment, "%gs") == 0 &&
 +                  dst->reg &&
 +                  dst->base_rc == BBRG_RSP)
 +                      usage = BBOU_NOP;
 +              else
 +                      usage = BBOU_RSWD;
 +              break;
 +      case BBOU_CMPXCHG:
 +              /* Read RAX, write RAX plus src read, dst write */
 +              bb_reg_read(BBRG_RAX);
 +              bb_reg_set_undef(BBRG_RAX);
 +              usage = BBOU_RSWD;
 +              break;
 +      case BBOU_CMPXCHGD:
 +              /* Read RAX, RBX, RCX, RDX, write RAX, RDX plus src read/write */
 +              bb_reg_read(BBRG_RAX);
 +              bb_reg_read(BBRG_RBX);
 +              bb_reg_read(BBRG_RCX);
 +              bb_reg_read(BBRG_RDX);
 +              bb_reg_set_undef(BBRG_RAX);
 +              bb_reg_set_undef(BBRG_RDX);
 +              usage = BBOU_RSWS;
 +              break;
 +      case BBOU_CPUID:
 +              /* Read RAX, write RAX, RBX, RCX, RDX */
 +              bb_reg_read(BBRG_RAX);
 +              bb_reg_set_undef(BBRG_RAX);
 +              bb_reg_set_undef(BBRG_RBX);
 +              bb_reg_set_undef(BBRG_RCX);
 +              bb_reg_set_undef(BBRG_RDX);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_CWD:
 +              /* Convert word in RAX, RDX.  Read RAX, write RDX */
 +              bb_reg_read(BBRG_RAX);
 +              bb_reg_set_undef(BBRG_RDX);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_DIV:  /* drop through */
 +      case BBOU_IDIV:
 +              /* The 8 bit variants only affect RAX, the 16, 32 and 64 bit
 +               * variants affect RDX as well.
 +               */
 +              switch (usage) {
 +              case BBOU_DIV:
 +                      opcode_suffix = bb_decode.opcode[3];
 +                      break;
 +              case BBOU_IDIV:
 +                      opcode_suffix = bb_decode.opcode[4];
 +                      break;
 +              default:
 +                      opcode_suffix = 'q';
 +                      break;
 +              }
 +              operand_length = bb_operand_length(src, opcode_suffix);
 +              bb_reg_read(BBRG_RAX);
 +              bb_reg_set_undef(BBRG_RAX);
 +              if (operand_length != 8) {
 +                      bb_reg_read(BBRG_RDX);
 +                      bb_reg_set_undef(BBRG_RDX);
 +              }
 +              usage = BBOU_RS;
 +              break;
 +      case BBOU_IMUL:
 +              /* Only the two and three operand forms get here.  The one
 +               * operand form is treated as mul.
 +               */
 +              if (dst2->present) {
 +                      /* The three operand form is a special case, read the first two
 +                       * operands, write the third.
 +                       */
 +                      bb_read_operand(src);
 +                      bb_read_operand(dst);
 +                      bb_write_operand(dst2);
 +                      usage = BBOU_NOP;
 +              } else {
 +                      usage = BBOU_RSRDWD;
 +              }
 +              break;
 +      case BBOU_IRET:
 +              bb_sanity_check(0);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_JMP:
 +              if (bb_is_static_disp(src))
 +                      bb_transfer(bb_curr_addr, src->disp, 0);
 +              else if (src->indirect &&
 +                       src->disp &&
 +                       src->base == NULL &&
 +                       src->index &&
 +                       src->scale == KDB_WORD_SIZE)
 +                      bb_pass2_computed_jmp(src);
 +              usage = BBOU_RS;
 +              break;
 +      case BBOU_LAHF:
 +              /* Write RAX */
 +              bb_reg_set_undef(BBRG_RAX);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_LEA:
 +              /* dst = src + disp.  Often used to calculate offsets into the
 +               * stack, so check if it uses a stack pointer.
 +               */
 +              usage = BBOU_RSWD;
 +              if (bb_is_simple_memory(src)) {
 +                     if (bb_is_osp_defined(src->base_rc)) {
 +                              bb_reg_set_reg(dst->base_rc, src->base_rc);
 +                              bb_adjust_osp_instruction(1);
 +                              usage = BBOU_RS;
 +                      } else if (src->disp == 0 &&
 +                                 src->base_rc == dst->base_rc) {
 +                              /* lea 0(%reg),%reg is generated by i386
 +                               * GENERIC_NOP7.
 +                               */
 +                              usage = BBOU_NOP;
 +                      } else if (src->disp == 4096 &&
 +                                 (src->base_rc == BBRG_R8 ||
 +                                  src->base_rc == BBRG_RDI) &&
 +                                 strcmp(bb_func_name, "relocate_kernel") == 0) {
 +                              /* relocate_kernel: setup a new stack at the
 +                               * end of the physical control page, using
 +                               * (x86_64) lea 4096(%r8),%rsp or (i386) lea
 +                               * 4096(%edi),%esp
 +                               */
 +                              usage = BBOU_NOP;
 +                      }
 +              }
 +              break;
 +      case BBOU_LEAVE:
 +              /* RSP = RBP; RBP = *(RSP); RSP += KDB_WORD_SIZE; */
 +              bb_reg_set_reg(BBRG_RSP, BBRG_RBP);
 +              if (bb_is_osp_defined(BBRG_RSP))
 +                      bb_reg_set_memory(BBRG_RBP, BBRG_RSP, 0);
 +              else
 +                      bb_reg_set_undef(BBRG_RBP);
 +              if (bb_is_osp_defined(BBRG_RSP))
 +                      bb_adjust_osp(BBRG_RSP, KDB_WORD_SIZE);
 +              /* common_interrupt uses leave in a non-standard manner */
 +              if (strcmp(bb_func_name, "common_interrupt") != 0)
 +                      bb_sanity_check(0);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_LODS:
 +              /* Read RSI, write RAX, RSI */
 +              bb_reg_read(BBRG_RSI);
 +              bb_reg_set_undef(BBRG_RAX);
 +              bb_reg_set_undef(BBRG_RSI);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_LOOP:
 +              /* Read and write RCX */
 +              bb_reg_read(BBRG_RCX);
 +              bb_reg_set_undef(BBRG_RCX);
 +              if (bb_is_static_disp(src))
 +                      bb_transfer(bb_curr_addr, src->disp, 0);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_LSS:
 +              /* lss offset(%esp),%esp leaves esp well defined */
 +              if (dst->reg &&
 +                  dst->base_rc == BBRG_RSP &&
 +                  bb_is_simple_memory(src) &&
 +                  src->base_rc == BBRG_RSP) {
 +                      bb_adjust_osp(BBRG_RSP, 2*KDB_WORD_SIZE + src->disp);
 +                      usage = BBOU_NOP;
 +              } else {
 +                      usage = BBOU_RSWD;
 +              }
 +              break;
 +      case BBOU_MONITOR:
 +              /* Read RAX, RCX, RDX */
 +              bb_reg_set_undef(BBRG_RAX);
 +              bb_reg_set_undef(BBRG_RCX);
 +              bb_reg_set_undef(BBRG_RDX);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_MOV:
 +              usage = bb_usage_mov(src, dst, sizeof("mov")-1);
 +              break;
 +      case BBOU_MOVS:
 +              /* Read RSI, RDI, write RSI, RDI */
 +              bb_reg_read(BBRG_RSI);
 +              bb_reg_read(BBRG_RDI);
 +              bb_reg_set_undef(BBRG_RSI);
 +              bb_reg_set_undef(BBRG_RDI);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_MUL:
 +              /* imul (one operand form only) or mul.  Read RAX.  If the
 +               * operand length is not 8 then write RDX.
 +               */
 +              if (bb_decode.opcode[0] == 'i')
 +                      opcode_suffix = bb_decode.opcode[4];
 +              else
 +                      opcode_suffix = bb_decode.opcode[3];
 +              operand_length = bb_operand_length(src, opcode_suffix);
 +              bb_reg_read(BBRG_RAX);
 +              if (operand_length != 8)
 +                      bb_reg_set_undef(BBRG_RDX);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_MWAIT:
 +              /* Read RAX, RCX */
 +              bb_reg_read(BBRG_RAX);
 +              bb_reg_read(BBRG_RCX);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_NOP:
 +              break;
 +      case BBOU_OUTS:
 +              /* Read RSI, RDX, write RSI */
 +              bb_reg_read(BBRG_RSI);
 +              bb_reg_read(BBRG_RDX);
 +              bb_reg_set_undef(BBRG_RSI);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_POP:
 +              /* Complicated by the fact that you can pop from top of stack
 +               * to a stack location; in that case the destination location
 +               * is calculated after adjusting RSP.  Analysis of the kernel
 +               * code shows that gcc only uses this strange format to get the
 +               * flags into a local variable, e.g. pushf; popl 0x10(%esp); so
 +               * I am going to ignore this special case.
 +               */
 +              usage = BBOU_WS;
 +              if (!bb_is_osp_defined(BBRG_RSP)) {
 +                      if (!bb_is_scheduler_address()) {
 +                              kdb_printf("pop when BBRG_RSP is undefined?\n");
 +                              bb_giveup = 1;
 +                      }
 +              } else {
 +                      if (src->reg) {
 +                              bb_reg_set_memory(src->base_rc, BBRG_RSP, 0);
 +                              usage = BBOU_NOP;
 +                      }
 +                      /* pop %rsp does not adjust rsp */
 +                      if (!src->reg ||
 +                          src->base_rc != BBRG_RSP)
 +                              bb_adjust_osp(BBRG_RSP, KDB_WORD_SIZE);
 +              }
 +              break;
 +      case BBOU_POPF:
 +              /* Do not care about flags, just adjust RSP */
 +              if (!bb_is_osp_defined(BBRG_RSP)) {
 +                      if (!bb_is_scheduler_address()) {
 +                              kdb_printf("popf when BBRG_RSP is undefined?\n");
 +                              bb_giveup = 1;
 +                      }
 +              } else {
 +                      bb_adjust_osp(BBRG_RSP, KDB_WORD_SIZE);
 +              }
 +              usage = BBOU_WS;
 +              break;
 +      case BBOU_PUSH:
 +              /* Complicated by the fact that you can push from a stack
  +               * location to top of stack; in this case the source location
  +               * is calculated before adjusting RSP.  Analysis of the kernel code shows
 +               * that gcc only uses this strange format to restore the flags
 +               * from a local variable, e.g. pushl 0x10(%esp); popf; so I am
 +               * going to ignore this special case.
 +               */
 +              usage = BBOU_RS;
 +              if (!bb_is_osp_defined(BBRG_RSP)) {
 +                      if (!bb_is_scheduler_address()) {
 +                              kdb_printf("push when BBRG_RSP is undefined?\n");
 +                              bb_giveup = 1;
 +                      }
 +              } else {
 +                      bb_adjust_osp(BBRG_RSP, -KDB_WORD_SIZE);
 +                      if (src->reg &&
 +                          bb_reg_code_offset(BBRG_RSP) <= 0)
 +                              bb_memory_set_reg(BBRG_RSP, src->base_rc, 0);
 +              }
 +              break;
 +      case BBOU_PUSHF:
 +              /* Do not care about flags, just adjust RSP */
 +              if (!bb_is_osp_defined(BBRG_RSP)) {
 +                      if (!bb_is_scheduler_address()) {
 +                              kdb_printf("pushf when BBRG_RSP is undefined?\n");
 +                              bb_giveup = 1;
 +                      }
 +              } else {
 +                      bb_adjust_osp(BBRG_RSP, -KDB_WORD_SIZE);
 +              }
 +              usage = BBOU_WS;
 +              break;
 +      case BBOU_RDMSR:
 +              /* Read RCX, write RAX, RDX */
 +              bb_reg_read(BBRG_RCX);
 +              bb_reg_set_undef(BBRG_RAX);
 +              bb_reg_set_undef(BBRG_RDX);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_RDTSC:
 +              /* Write RAX, RDX */
 +              bb_reg_set_undef(BBRG_RAX);
 +              bb_reg_set_undef(BBRG_RDX);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_RET:
 +              usage = BBOU_NOP;
 +              if (src->immediate && bb_is_osp_defined(BBRG_RSP)) {
 +                      bb_adjust_osp(BBRG_RSP, src->disp);
 +              }
 +              /* Functions that restore state which was saved by another
 +               * function or build new kernel stacks.  We cannot verify what
 +               * is being restored so skip the sanity check.
 +               */
 +              if (strcmp(bb_func_name, "restore_image") == 0 ||
 +                  strcmp(bb_func_name, "relocate_kernel") == 0 ||
 +                  strcmp(bb_func_name, "identity_mapped") == 0 ||
 +                  strcmp(bb_func_name, "xen_iret_crit_fixup") == 0 ||
 +                  strcmp(bb_func_name, "math_abort") == 0 ||
 +                  strcmp(bb_func_name, "save_args") == 0 ||
 +                  strcmp(bb_func_name, "kretprobe_trampoline_holder") == 0)
 +                      break;
 +              bb_sanity_check(0);
 +              break;
 +      case BBOU_SAHF:
 +              /* Read RAX */
 +              bb_reg_read(BBRG_RAX);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_SCAS:
 +              /* Read RAX, RDI, write RDI */
 +              bb_reg_read(BBRG_RAX);
 +              bb_reg_read(BBRG_RDI);
 +              bb_reg_set_undef(BBRG_RDI);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_SUB:
 +              /* Special case for sub instructions that adjust registers
 +               * which are mapping the stack.
 +               */
 +              if (dst->reg && bb_is_osp_defined(dst->base_rc)) {
 +                      bb_adjust_osp_instruction(-1);
 +                      usage = BBOU_RS;
 +              } else {
 +                      usage = BBOU_RSRDWD;
 +              }
 +              break;
 +      case BBOU_SYSEXIT:
 +              bb_sanity_check(1);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_SYSRET:
 +              bb_sanity_check(1);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_WRMSR:
 +              /* Read RCX, RAX, RDX */
 +              bb_reg_read(BBRG_RCX);
 +              bb_reg_read(BBRG_RAX);
 +              bb_reg_read(BBRG_RDX);
 +              usage = BBOU_NOP;
 +              break;
 +      case BBOU_XADD:
 +              usage = bb_usage_xadd(src, dst);
 +              break;
 +      case BBOU_XCHG:
 +              /* i386 do_IRQ with 4K stacks does xchg %ebx,%esp; call
 +               * irq_handler; mov %ebx,%esp; to switch stacks.  Ignore this
 +               * stack switch when tracking registers, it is handled by
 +               * higher level backtrace code.  Convert xchg %ebx,%esp to mov
 +               * %esp,%ebx so the later mov %ebx,%esp becomes a NOP and the
 +               * stack remains defined so we can backtrace through do_IRQ's
 +               * stack switch.
 +               *
 +               * Ditto for do_softirq.
 +               */
 +              if (src->reg &&
 +                  dst->reg &&
 +                  src->base_rc == BBRG_RBX &&
 +                  dst->base_rc == BBRG_RSP &&
 +                  (strcmp(bb_func_name, "do_IRQ") == 0 ||
 +                   strcmp(bb_func_name, "do_softirq") == 0)) {
 +                      strcpy(bb_decode.opcode, "mov");
 +                      usage = bb_usage_mov(dst, src, sizeof("mov")-1);
 +              } else {
 +                      usage = bb_usage_xchg(src, dst);
 +              }
 +              break;
 +      case BBOU_XOR:
 +              /* xor %reg,%reg only counts as a register write, the original
 +               * contents of reg are irrelevant.
 +               */
 +              if (src->reg && dst->reg && src->base_rc == dst->base_rc)
 +                      usage = BBOU_WS;
 +              else
 +                      usage = BBOU_RSRDWD;
 +              break;
 +      }
 +
 +      /* The switch statement above handled all the special cases.  Every
 +       * opcode should now have a usage of NOP or one of the generic cases.
 +       */
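  +      /* The generic cases below treat BBOU_RS/RD/WS/WD as independent flag
  +       * bits: RS/RD read the source/destination operand, WS/WD write them,
  +       * and combined values such as BBOU_RSWD are handled as the union of
  +       * those bits.
  +       */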
 +      if (usage == BBOU_UNKNOWN || usage == BBOU_NOP) {
 +              /* nothing to do */
 +      } else if (usage >= BBOU_RS && usage <= BBOU_RSRDWSWD) {
 +              if (usage & BBOU_RS)
 +                      bb_read_operand(src);
 +              if (usage & BBOU_RD)
 +                      bb_read_operand(dst);
 +              if (usage & BBOU_WS)
 +                      bb_write_operand(src);
 +              if (usage & BBOU_WD)
 +                      bb_write_operand(dst);
 +      } else {
 +              kdb_printf("%s: opcode not fully handled\n", __FUNCTION__);
 +              if (!KDB_DEBUG(BB)) {
 +                      bb_print_opcode();
 +                      if (bb_decode.src.present)
 +                              bb_print_operand("src", &bb_decode.src);
 +                      if (bb_decode.dst.present)
 +                              bb_print_operand("dst", &bb_decode.dst);
 +                      if (bb_decode.dst2.present)
 +                              bb_print_operand("dst2", &bb_decode.dst2);
 +              }
 +              bb_giveup = 1;
 +      }
 +}
 +
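  +/* Parse one line of disassembler output that has been accumulated in
  + * bb_buffer.  The line has the general shape (illustrative example only):
  + *   <address> <func+offset>: [prefix] opcode [src[,dst[,dst2]]]
  + * Everything up to the first ':' is skipped, "(bad)" opcodes are treated as
  + * nop, any lock/rep/rex/addr prefix is split off, trailing comments are
  + * stripped and the operands are separated on ',' outside parentheses before
  + * being handed to bb_parse_operand().
  + */
  +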
 +static void
 +bb_parse_buffer(void)
 +{
 +      char *p, *src, *dst = NULL, *dst2 = NULL;
 +      int paren = 0;
 +      p = bb_buffer;
 +      memset(&bb_decode, 0, sizeof(bb_decode));
 +      KDB_DEBUG_BB(" '%s'\n", p);
 +      p += strcspn(p, ":");   /* skip address and function name+offset: */
 +      if (*p++ != ':') {
 +              kdb_printf("%s: cannot find ':' in buffer '%s'\n",
 +                         __FUNCTION__, bb_buffer);
 +              bb_giveup = 1;
 +              return;
 +      }
 +      p += strspn(p, " \t");  /* step to opcode */
 +      if (strncmp(p, "(bad)", 5) == 0)
 +              strcpy(p, "nop");
 +      /* separate any opcode prefix */
 +      if (strncmp(p, "lock", 4) == 0 ||
 +          strncmp(p, "rep", 3) == 0 ||
 +          strncmp(p, "rex", 3) == 0 ||
 +          strncmp(p, "addr", 4) == 0) {
 +              bb_decode.prefix = p;
 +              p += strcspn(p, " \t");
 +              *p++ = '\0';
 +              p += strspn(p, " \t");
 +      }
 +      bb_decode.opcode = p;
 +      strsep(&p, " \t");      /* step to end of opcode */
 +      if (bb_parse_opcode())
 +              return;
 +      if (!p)
 +              goto no_operands;
 +      p += strspn(p, " \t");  /* step to operand(s) */
 +      if (!*p)
 +              goto no_operands;
 +      src = p;
 +      p = strsep(&p, " \t");  /* strip comments after operands */
 +      /* split 'src','dst' but ignore ',' inside '(' ')' */
 +      while (*p) {
 +              if (*p == '(') {
 +                      ++paren;
 +              } else if (*p == ')') {
 +                      --paren;
 +              } else if (*p == ',' && paren == 0) {
 +                      *p = '\0';
 +                      if (dst)
 +                              dst2 = p+1;
 +                      else
 +                              dst = p+1;
 +              }
 +              ++p;
 +      }
 +      bb_parse_operand(src, &bb_decode.src);
 +      if (KDB_DEBUG(BB))
 +              bb_print_operand("src", &bb_decode.src);
 +      if (dst && !bb_giveup) {
 +              bb_parse_operand(dst, &bb_decode.dst);
 +              if (KDB_DEBUG(BB))
 +                      bb_print_operand("dst", &bb_decode.dst);
 +      }
 +      if (dst2 && !bb_giveup) {
 +              bb_parse_operand(dst2, &bb_decode.dst2);
 +              if (KDB_DEBUG(BB))
 +                      bb_print_operand("dst2", &bb_decode.dst2);
 +      }
 +no_operands:
 +      if (!bb_giveup)
 +              bb_usage();
 +}
 +
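  +/* fprintf style callback for the pass 2 disassembly.  Output fragments are
  + * accumulated in bb_buffer; once a complete line (terminated by '\n') has
  + * been assembled it is passed through bb_fixup_switch_to(), parsed and the
  + * buffer is reset for the next line.
  + */
  +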
 +static int
 +bb_dis_pass2(PTR file, const char *fmt, ...)
 +{
 +      char *p;
 +      int l = strlen(bb_buffer);
 +      va_list ap;
 +      va_start(ap, fmt);
 +      vsnprintf(bb_buffer + l, sizeof(bb_buffer) - l, fmt, ap);
 +      va_end(ap);
 +      if ((p = strchr(bb_buffer, '\n'))) {
 +              *p = '\0';
 +              p = bb_buffer;
 +              p += strcspn(p, ":");
 +              if (*p++ == ':')
 +                      bb_fixup_switch_to(p);
 +              bb_parse_buffer();
 +              bb_buffer[0] = '\0';
 +      }
 +      return 0;
 +}
 +
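  +/* print_address style callback for the pass 2 disassembly.  Prints the raw
  + * address and, when kdbnearsym() resolves it, the symbol name plus any
  + * offset in the usual <name+0xoffset> form.
  + */
  +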
 +static void
 +bb_printaddr_pass2(bfd_vma addr, disassemble_info *dip)
 +{
 +      kdb_symtab_t symtab;
 +      unsigned int offset;
 +      dip->fprintf_func(dip->stream, "0x%lx", addr);
 +      kdbnearsym(addr, &symtab);
 +      if (symtab.sym_name) {
 +              dip->fprintf_func(dip->stream, " <%s", symtab.sym_name);
 +              if ((offset = addr - symtab.sym_start))
 +                      dip->fprintf_func(dip->stream, "+0x%x", offset);
 +              dip->fprintf_func(dip->stream, ">");
 +      }
 +}
 +
 +/* Set the starting register and memory state for the current bb */
 +
 +static void
 +bb_start_block0_special(void)
 +{
 +      int i;
 +      short offset_address;
 +      enum bb_reg_code reg, value;
 +      struct bb_name_state *r;
 +      for (i = 0, r = bb_special_cases;
 +           i < ARRAY_SIZE(bb_special_cases);
 +           ++i, ++r) {
 +              if (bb_func_start == r->address && r->fname == NULL)
 +                      goto match;
 +      }
 +      return;
 +match:
 +      /* Set the running registers */
 +      for (reg = BBRG_RAX; reg < r->regs_size; ++reg) {
 +              value = r->regs[reg].value;
 +              if (test_bit(value, r->skip_regs.bits)) {
 +                      /* this regs entry is not defined for this label */
 +                      continue;
 +              }
 +              bb_reg_code_set_value(reg, value);
 +              bb_reg_code_set_offset(reg, r->regs[reg].offset);
 +      }
 +      /* Set any memory contents, e.g. pt_regs.  Adjust RSP as required. */
 +      offset_address = 0;
 +      for (i = 0; i < r->mem_size; ++i) {
 +              offset_address = max_t(int,
 +                              r->mem[i].offset_address + KDB_WORD_SIZE,
 +                              offset_address);
 +      }
 +      if (bb_reg_code_offset(BBRG_RSP) > -offset_address)
 +              bb_adjust_osp(BBRG_RSP, -offset_address - bb_reg_code_offset(BBRG_RSP));
 +      for (i = 0; i < r->mem_size; ++i) {
 +              value = r->mem[i].value;
 +              if (test_bit(value, r->skip_mem.bits)) {
 +                      /* this memory entry is not defined for this label */
 +                      continue;
 +              }
 +              bb_memory_set_reg_value(BBRG_RSP, r->mem[i].offset_address,
 +                                      value, 0);
 +              bb_reg_set_undef(value);
 +      }
 +      return;
 +}
 +
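  +/* Establish the register and memory state at the start of basic block
  + * 'number'.  Block 0 gets the well defined function entry state; every other
  + * block merges the saved exit states of all the blocks that jump to it,
  + * making anything that differs between those inputs undefined.
  + */
  +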
 +static void
 +bb_pass2_start_block(int number)
 +{
 +      int i, j, k, first, changed;
 +      size_t size;
 +      struct bb_jmp *bb_jmp;
 +      struct bb_reg_state *state;
 +      struct bb_memory_contains *c1, *c2;
 +      bb_reg_state->mem_count = bb_reg_state_max;
 +      size = bb_reg_state_size(bb_reg_state);
 +      memset(bb_reg_state, 0, size);
 +
 +      if (number == 0) {
 +              /* The first block is assumed to have well defined inputs */
 +              bb_start_block0();
 +              /* Some assembler labels have non-standard entry
 +               * states.
 +               */
 +              bb_start_block0_special();
 +              bb_reg_state_print(bb_reg_state);
 +              return;
 +      }
 +
 +      /* Merge all the input states for the current bb together */
 +      first = 1;
 +      changed = 0;
 +      for (i = 0; i < bb_jmp_count; ++i) {
 +              bb_jmp = bb_jmp_list + i;
 +              if (bb_jmp->to != bb_curr->start)
 +                      continue;
 +              state = bb_jmp->state;
 +              if (!state)
 +                      continue;
 +              if (first) {
 +                      size = bb_reg_state_size(state);
 +                      memcpy(bb_reg_state, state, size);
 +                      KDB_DEBUG_BB("  first state %p\n", state);
 +                      bb_reg_state_print(bb_reg_state);
 +                      first = 0;
 +                      continue;
 +              }
 +
 +              KDB_DEBUG_BB("  merging state %p\n", state);
 +              /* Merge the register states */
 +              for (j = 0; j < ARRAY_SIZE(state->contains); ++j) {
 +                      if (memcmp(bb_reg_state->contains + j,
 +                                 state->contains + j,
 +                                 sizeof(bb_reg_state->contains[0]))) {
 +                              /* Different states for this register from two
 +                               * or more inputs, make it undefined.
 +                               */
 +                              if (bb_reg_state->contains[j].value ==
 +                                  BBRG_UNDEFINED) {
 +                                      KDB_DEBUG_BB("  ignoring %s\n",
 +                                                  bbrg_name[j + BBRG_RAX]);
 +                              } else {
 +                                      bb_reg_set_undef(BBRG_RAX + j);
 +                                      changed = 1;
 +                              }
 +                      }
 +              }
 +
 +              /* Merge the memory states.  This relies on both
 +               * bb_reg_state->memory and state->memory being sorted in
 +               * descending order, with undefined entries at the end.
 +               */
 +              c1 = bb_reg_state->memory;
 +              c2 = state->memory;
 +              j = k = 0;
 +              while (j < bb_reg_state->mem_count &&
 +                     k < state->mem_count) {
 +                      if (c1->offset_address < c2->offset_address) {
 +                              KDB_DEBUG_BB_OFFSET(c2->offset_address,
 +                                                  "  ignoring c2->offset_address ",
 +                                                  "\n");
 +                              ++c2;
 +                              ++k;
 +                              continue;
 +                      }
 +                      if (c1->offset_address > c2->offset_address) {
 +                              /* Memory location is not in all input states,
 +                               * delete the memory location.
 +                               */
 +                              bb_delete_memory(c1->offset_address);
 +                              changed = 1;
 +                              ++c1;
 +                              ++j;
 +                              continue;
 +                      }
 +                      if (memcmp(c1, c2, sizeof(*c1))) {
 +                              /* Same location, different contents, delete
 +                               * the memory location.
 +                               */
 +                              bb_delete_memory(c1->offset_address);
 +                              KDB_DEBUG_BB_OFFSET(c2->offset_address,
 +                                                  "  ignoring c2->offset_address ",
 +                                                  "\n");
 +                              changed = 1;
 +                      }
 +                      ++c1;
 +                      ++c2;
 +                      ++j;
 +                      ++k;
 +              }
 +              while (j < bb_reg_state->mem_count) {
 +                      bb_delete_memory(c1->offset_address);
 +                      changed = 1;
 +                      ++c1;
 +                      ++j;
 +              }
 +      }
 +      if (changed) {
 +              KDB_DEBUG_BB("  final state\n");
 +              bb_reg_state_print(bb_reg_state);
 +      }
 +}
 +
 +/* We have reached the exit point from the current function, either a call to
  + * the next function or the instruction that was about to be executed when an
 + * interrupt occurred.  Save the current register state in bb_exit_state.
 + */
 +
 +static void
 +bb_save_exit_state(void)
 +{
 +      size_t size;
 +      debug_kfree(bb_exit_state);
 +      bb_exit_state = NULL;
 +      bb_reg_state_canonicalize();
 +      size = bb_reg_state_size(bb_reg_state);
 +      bb_exit_state = debug_kmalloc(size, GFP_ATOMIC);
 +      if (!bb_exit_state) {
 +              kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
 +              bb_giveup = 1;
 +              return;
 +      }
 +      memcpy(bb_exit_state, bb_reg_state, size);
 +}
 +
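  +/* Process every basic block whose input state has changed, tolerating up to
  + * allow_missing undefined inputs per block.  Returns non-zero if blocks are
  + * still pending (the caller retries with a larger allow_missing), 0 when all
  + * blocks have been handled.
  + */
  +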
 +static int
 +bb_pass2_do_changed_blocks(int allow_missing)
 +{
 +      int i, j, missing, changed, maxloops;
 +      unsigned long addr;
 +      struct bb_jmp *bb_jmp;
 +      KDB_DEBUG_BB("\n  %s: allow_missing %d\n", __FUNCTION__, allow_missing);
 +      /* Absolute worst case is we have to iterate over all the basic blocks
 +       * in an "out of order" state, each iteration losing one register or
 +       * memory state.  Any more loops than that is a bug.  "out of order"
 +       * means that the layout of blocks in memory does not match the logic
 +       * flow through those blocks so (for example) block 27 comes before
 +       * block 2.  To allow for out of order blocks, multiply maxloops by the
 +       * number of blocks.
 +       */
 +      maxloops = (KDB_INT_REGISTERS + bb_reg_state_max) * bb_count;
 +      changed = 1;
 +      do {
 +              changed = 0;
 +              for (i = 0; i < bb_count; ++i) {
 +                      bb_curr = bb_list[i];
 +                      if (!bb_curr->changed)
 +                              continue;
 +                      missing = 0;
 +                      for (j = 0, bb_jmp = bb_jmp_list;
 +                           j < bb_jmp_count;
 +                           ++j, ++bb_jmp) {
 +                              if (bb_jmp->to == bb_curr->start &&
 +                                  !bb_jmp->state)
 +                                      ++missing;
 +                      }
 +                      if (missing > allow_missing)
 +                              continue;
 +                      bb_curr->changed = 0;
 +                      changed = 1;
 +                      KDB_DEBUG_BB("\n  bb[%d]\n", i);
 +                      bb_pass2_start_block(i);
 +                      for (addr = bb_curr->start;
 +                           addr <= bb_curr->end; ) {
 +                              bb_curr_addr = addr;
 +                              if (addr == bb_exit_addr)
 +                                      bb_save_exit_state();
 +                              addr += kdba_id_printinsn(addr, &kdb_di);
 +                              kdb_di.fprintf_func(NULL, "\n");
 +                              if (bb_giveup)
 +                                      goto done;
 +                      }
 +                      if (!bb_exit_state) {
 +                              /* ATTRIB_NORET functions are a problem with
 +                               * the current gcc.  Allow the trailing address
  +                       * a bit of leeway.
 +                               */
 +                              if (addr == bb_exit_addr ||
 +                                  addr == bb_exit_addr + 1)
 +                                      bb_save_exit_state();
 +                      }
 +                      if (bb_curr->drop_through)
 +                              bb_transfer(bb_curr->end,
 +                                          bb_list[i+1]->start, 1);
 +              }
 +              if (maxloops-- == 0) {
 +                      kdb_printf("\n\n%s maxloops reached\n",
 +                                 __FUNCTION__);
 +                      bb_giveup = 1;
 +                      goto done;
 +              }
 +      } while(changed);
 +done:
 +      for (i = 0; i < bb_count; ++i) {
 +              bb_curr = bb_list[i];
 +              if (bb_curr->changed)
 +                      return 1;       /* more to do, increase allow_missing */
 +      }
 +      return 0;       /* all blocks done */
 +}
 +
 +/* Assume that the current function is a pass through function that does not
 + * refer to its register parameters.  Exclude known asmlinkage functions and
 + * assume the other functions actually use their registers.
 + */
 +
 +static void
 +bb_assume_pass_through(void)
 +{
 +      static int first_time = 1;
 +      if (strncmp(bb_func_name, "sys_", 4) == 0 ||
 +          strncmp(bb_func_name, "compat_sys_", 11) == 0 ||
 +          strcmp(bb_func_name, "schedule") == 0 ||
 +          strcmp(bb_func_name, "do_softirq") == 0 ||
 +          strcmp(bb_func_name, "printk") == 0 ||
 +          strcmp(bb_func_name, "vprintk") == 0 ||
 +          strcmp(bb_func_name, "preempt_schedule") == 0 ||
 +          strcmp(bb_func_name, "start_kernel") == 0 ||
 +          strcmp(bb_func_name, "csum_partial") == 0 ||
 +          strcmp(bb_func_name, "csum_partial_copy_generic") == 0 ||
 +          strcmp(bb_func_name, "math_state_restore") == 0 ||
 +          strcmp(bb_func_name, "panic") == 0 ||
 +          strcmp(bb_func_name, "kdb_printf") == 0 ||
 +          strcmp(bb_func_name, "kdb_interrupt") == 0)
 +              return;
 +      if (bb_asmlinkage_arch())
 +              return;
 +      bb_reg_params = REGPARM;
 +      if (first_time) {
 +              kdb_printf("  %s has memory parameters but no register "
 +                         "parameters.\n  Assuming it is a 'pass "
 +                         "through' function that does not refer to "
 +                         "its register\n  parameters and setting %d "
 +                         "register parameters\n",
 +                         bb_func_name, REGPARM);
 +              first_time = 0;
 +              return;
 +      }
 +      kdb_printf("  Assuming %s is 'pass through' with %d register "
 +                 "parameters\n",
 +                 bb_func_name, REGPARM);
 +}
 +
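  +/* Pass 2: disassemble each basic block again, this time tracking the
  + * register and memory state, until the block states converge or the
  + * analysis gives up.
  + */
  +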
 +static void
 +bb_pass2(void)
 +{
 +      int allow_missing;
 +      if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
 +              kdb_printf("%s: start\n", __FUNCTION__);
 +
 +      kdb_di.fprintf_func = bb_dis_pass2;
 +      kdb_di.print_address_func = bb_printaddr_pass2;
 +
 +      bb_reg_state = debug_kmalloc(sizeof(*bb_reg_state), GFP_ATOMIC);
 +      if (!bb_reg_state) {
 +              kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
 +              bb_giveup = 1;
 +              return;
 +      }
 +      bb_list[0]->changed = 1;
 +
 +      /* If a block does not have all its input states available then it is
 +       * possible for a register to initially appear to hold a known value,
  +       * but once other inputs are available it becomes a variable
 +       * value.  The initial false state of "known" can generate false values
 +       * for other registers and can even make it look like stack locations
 +       * are being changed.
 +       *
 +       * To avoid these false positives, only process blocks which have all
 +       * their inputs defined.  That gives a clean depth first traversal of
 +       * the tree, except for loops.  If there are any loops, then start
 +       * processing blocks with one missing input, then two missing inputs
 +       * etc.
 +       *
 +       * Absolute worst case is we have to iterate over all the jmp entries,
 +       * each iteration allowing one more missing input.  Any more loops than
 +       * that is a bug.  Watch out for the corner case of 0 jmp entries.
 +       */
 +      for (allow_missing = 0; allow_missing <= bb_jmp_count; ++allow_missing) {
 +              if (!bb_pass2_do_changed_blocks(allow_missing))
 +                      break;
 +              if (bb_giveup)
 +                      break;
 +      }
 +      if (allow_missing > bb_jmp_count) {
 +              kdb_printf("\n\n%s maxloops reached\n",
 +                         __FUNCTION__);
 +              bb_giveup = 1;
 +              return;
 +      }
 +
 +      if (bb_memory_params && bb_reg_params)
 +              bb_reg_params = REGPARM;
 +      if (REGPARM &&
 +          bb_memory_params &&
 +          !bb_reg_params)
 +              bb_assume_pass_through();
 +      if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM)) {
 +              kdb_printf("%s: end bb_reg_params %d bb_memory_params %d\n",
 +                         __FUNCTION__, bb_reg_params, bb_memory_params);
 +              if (bb_exit_state) {
 +                      kdb_printf("%s: bb_exit_state at " kdb_bfd_vma_fmt0 "\n",
 +                                 __FUNCTION__, bb_exit_addr);
 +                      bb_do_reg_state_print(bb_exit_state);
 +              }
 +      }
 +}
 +
 +static void
 +bb_cleanup(void)
 +{
 +      int i;
 +      struct bb* bb;
 +      struct bb_reg_state *state;
 +      while (bb_count) {
 +              bb = bb_list[0];
 +              bb_delete(0);
 +      }
 +      debug_kfree(bb_list);
 +      bb_list = NULL;
 +      bb_count = bb_max = 0;
 +      for (i = 0; i < bb_jmp_count; ++i) {
 +              state = bb_jmp_list[i].state;
 +              if (state && --state->ref_count == 0)
 +                      debug_kfree(state);
 +      }
 +      debug_kfree(bb_jmp_list);
 +      bb_jmp_list = NULL;
 +      bb_jmp_count = bb_jmp_max = 0;
 +      debug_kfree(bb_reg_state);
 +      bb_reg_state = NULL;
 +      bb_reg_state_max = 0;
 +      debug_kfree(bb_exit_state);
 +      bb_exit_state = NULL;
 +      bb_reg_params = bb_memory_params = 0;
 +      bb_giveup = 0;
 +}
 +
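  +/* Return true if func_name is one of the global assembler labels listed in
  + * bb_spurious[], i.e. a label that sits in the middle of other code and must
  + * not be treated as a real function boundary.
  + */
  +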
 +static int
 +bb_spurious_global_label(const char *func_name)
 +{
 +      int i;
 +      for (i = 0; i < ARRAY_SIZE(bb_spurious); ++i) {
 +              if (strcmp(bb_spurious[i], func_name) == 0)
 +                      return 1;
 +      }
 +      return 0;
 +}
 +
 +/* Given the current actual register contents plus the exit state deduced from
 + * a basic block analysis of the current function, rollback the actual register
 + * contents to the values they had on entry to this function.
 + */
 +
 +static void
 +bb_actual_rollback(const struct kdb_activation_record *ar)
 +{
 +      int i, offset_address;
 +      struct bb_memory_contains *c;
 +      enum bb_reg_code reg;
 +      unsigned long address, osp = 0;
 +      struct bb_actual new[ARRAY_SIZE(bb_actual)];
 +
 +
 +      if (!bb_exit_state) {
 +              kdb_printf("%s: no bb_exit_state, cannot rollback\n",
 +                         __FUNCTION__);
 +              bb_giveup = 1;
 +              return;
 +      }
 +      memcpy(bb_reg_state, bb_exit_state, bb_reg_state_size(bb_exit_state));
 +      memset(new, 0, sizeof(new));
 +
  +      /* The most important register for obtaining saved state is rsp, so get
 +       * its new value first.  Prefer rsp if it is valid, then other
 +       * registers.  Saved values of rsp in memory are unusable without a
 +       * register that points to memory.
 +       */
 +      if (!bb_actual_valid(BBRG_RSP)) {
 +              kdb_printf("%s: no starting value for RSP, cannot rollback\n",
 +                         __FUNCTION__);
 +              bb_giveup = 1;
 +              return;
 +      }
 +      if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
 +              kdb_printf("%s: rsp " kdb_bfd_vma_fmt0,
 +                         __FUNCTION__, bb_actual_value(BBRG_RSP));
 +      i = BBRG_RSP;
 +      if (!bb_is_osp_defined(i)) {
 +              for (i = BBRG_RAX; i < BBRG_RAX + KDB_INT_REGISTERS; ++i) {
 +                      if (bb_is_osp_defined(i) && bb_actual_valid(i))
 +                              break;
 +              }
 +      }
 +      if (bb_is_osp_defined(i) && bb_actual_valid(i)) {
 +              osp = new[BBRG_RSP - BBRG_RAX].value =
 +                    bb_actual_value(i) - bb_reg_code_offset(i);
 +              new[BBRG_RSP - BBRG_RAX].valid = 1;
 +              if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
 +                      kdb_printf(" -> osp " kdb_bfd_vma_fmt0 "\n", osp);
 +      } else {
 +              bb_actual_set_valid(BBRG_RSP, 0);
 +              if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
 +                      kdb_printf(" -> undefined\n");
 +              kdb_printf("%s: no ending value for RSP, cannot rollback\n",
 +                         __FUNCTION__);
 +              bb_giveup = 1;
 +              return;
 +      }
 +
 +      /* Now the other registers.  First look at register values that have
 +       * been copied to other registers.
 +       */
 +      for (i = BBRG_RAX; i < BBRG_RAX + KDB_INT_REGISTERS; ++i) {
 +              reg = bb_reg_code_value(i);
 +              if (bb_is_int_reg(reg)) {
 +                      new[reg - BBRG_RAX] = bb_actual[i - BBRG_RAX];
 +                      if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM)) {
 +                              kdb_printf("%s: %s is in %s ",
 +                                          __FUNCTION__,
 +                                          bbrg_name[reg],
 +                                          bbrg_name[i]);
 +                              if (bb_actual_valid(i))
 +                                      kdb_printf(" -> " kdb_bfd_vma_fmt0 "\n",
 +                                                  bb_actual_value(i));
 +                              else
 +                                      kdb_printf("(invalid)\n");
 +                      }
 +              }
 +      }
 +
 +      /* Finally register values that have been saved on stack */
 +      for (i = 0, c = bb_reg_state->memory;
 +           i < bb_reg_state->mem_count;
 +           ++i, ++c) {
 +              offset_address = c->offset_address;
 +              reg = c->value;
 +              if (!bb_is_int_reg(reg))
 +                      continue;
 +              address = osp + offset_address;
 +              if (address < ar->stack.logical_start ||
 +                  address >= ar->stack.logical_end) {
 +                      new[reg - BBRG_RAX].value = 0;
 +                      new[reg - BBRG_RAX].valid = 0;
 +                      if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
 +                              kdb_printf("%s: %s -> undefined\n",
 +                                         __FUNCTION__,
 +                                         bbrg_name[reg]);
 +              } else {
 +                      if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM)) {
 +                              kdb_printf("%s: %s -> *(osp",
 +                                         __FUNCTION__,
 +                                         bbrg_name[reg]);
 +                              KDB_DEBUG_BB_OFFSET_PRINTF(offset_address, "", " ");
 +                              kdb_printf(kdb_bfd_vma_fmt0, address);
 +                      }
 +                      new[reg - BBRG_RAX].value = *(bfd_vma *)address;
 +                      new[reg - BBRG_RAX].valid = 1;
 +                      if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
 +                              kdb_printf(") = " kdb_bfd_vma_fmt0 "\n",
 +                                         new[reg - BBRG_RAX].value);
 +              }
 +      }
 +
 +      memcpy(bb_actual, new, sizeof(bb_actual));
 +}
 +
 +/* Return true if the current function is an interrupt handler */
 +
 +static bool
 +bb_interrupt_handler(kdb_machreg_t rip)
 +{
 +      unsigned long disp8, disp32, target, addr = (unsigned long)rip;
 +      unsigned char code[5];
 +      int i;
 +
 +      for (i = 0; i < ARRAY_SIZE(bb_hardware_handlers); ++i)
 +              if (strcmp(bb_func_name, bb_hardware_handlers[i]) == 0)
 +                      return 1;
 +
 +      /* Given the large number of interrupt handlers, it is easiest to look
 +       * at the next instruction and see if it is a jmp to the common exit
 +       * routines.
 +       */
 +      if (kdb_getarea(code, addr) ||
 +          kdb_getword(&disp32, addr+1, 4) ||
 +          kdb_getword(&disp8, addr+1, 1))
 +              return 0;       /* not a valid code address */
 +      if (code[0] == 0xe9) {
 +              target = addr + (s32) disp32 + 5;       /* jmp disp32 */
 +              if (target == bb_ret_from_intr ||
 +                  target == bb_common_interrupt ||
 +                  target == bb_error_entry)
 +                      return 1;
 +      }
 +      if (code[0] == 0xeb) {
 +              target = addr + (s8) disp8 + 2;         /* jmp disp8 */
 +              if (target == bb_ret_from_intr ||
 +                  target == bb_common_interrupt ||
 +                  target == bb_error_entry)
 +                      return 1;
 +      }
 +
 +      return 0;
 +}
 +
 +/* Copy argument information that was deduced by the basic block analysis and
 + * rollback into the kdb stack activation record.
 + */
 +
 +static void
 +bb_arguments(struct kdb_activation_record *ar)
 +{
 +      int i;
 +      enum bb_reg_code reg;
 +      kdb_machreg_t rsp;
 +      ar->args = bb_reg_params + bb_memory_params;
 +      bitmap_zero(ar->valid.bits, KDBA_MAXARGS);
 +      for (i = 0; i < bb_reg_params; ++i) {
 +              reg = bb_param_reg[i];
 +              if (bb_actual_valid(reg)) {
 +                      ar->arg[i] = bb_actual_value(reg);
 +                      set_bit(i, ar->valid.bits);
 +              }
 +      }
 +      if (!bb_actual_valid(BBRG_RSP))
 +              return;
 +      rsp = bb_actual_value(BBRG_RSP);
 +      for (i = bb_reg_params; i < ar->args; ++i) {
 +              rsp += KDB_WORD_SIZE;
 +              if (kdb_getarea(ar->arg[i], rsp) == 0)
 +                      set_bit(i, ar->valid.bits);
 +      }
 +}
 +
 +/* Given an exit address from a function, decompose the entire function into
 + * basic blocks and determine the register state at the exit point.
 + */
 +
 +static void
 +kdb_bb(unsigned long exit)
 +{
 +      kdb_symtab_t symtab;
 +      if (!kdbnearsym(exit, &symtab)) {
 +              kdb_printf("%s: address " kdb_bfd_vma_fmt0 " not recognised\n",
 +                         __FUNCTION__, exit);
 +              bb_giveup = 1;
 +              return;
 +      }
 +      bb_exit_addr = exit;
 +      bb_mod_name = symtab.mod_name;
 +      bb_func_name = symtab.sym_name;
 +      bb_func_start = symtab.sym_start;
 +      bb_func_end = symtab.sym_end;
 +      /* Various global labels exist in the middle of assembler code and have
 +       * a non-standard state.  Ignore these labels and use the start of the
 +       * previous label instead.
 +       */
 +      while (bb_spurious_global_label(symtab.sym_name)) {
 +              if (!kdbnearsym(symtab.sym_start - 1, &symtab))
 +                      break;
 +              bb_func_start = symtab.sym_start;
 +      }
 +      bb_mod_name = symtab.mod_name;
 +      bb_func_name = symtab.sym_name;
 +      bb_func_start = symtab.sym_start;
 +      /* Ignore spurious labels past this point and use the next non-spurious
 +       * label as the end point.
 +       */
 +      if (kdbnearsym(bb_func_end, &symtab)) {
 +              while (bb_spurious_global_label(symtab.sym_name)) {
 +                      bb_func_end = symtab.sym_end;
 +                      if (!kdbnearsym(symtab.sym_end + 1, &symtab))
 +                              break;
 +              }
 +      }
 +      bb_pass1();
 +      if (!bb_giveup)
 +              bb_pass2();
 +      if (bb_giveup)
 +              kdb_printf("%s: " kdb_bfd_vma_fmt0
 +                         " [%s]%s failed at " kdb_bfd_vma_fmt0 "\n\n",
 +                         __FUNCTION__, exit,
 +                         bb_mod_name, bb_func_name, bb_curr_addr);
 +}
 +
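  +/* kdb command handler (presumably registered as the 'bb1' command) that runs
  + * the basic block analysis on a single address with BB debugging enabled,
  + * mainly for debugging the analyser itself.
  + */
  +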
 +static int
 +kdb_bb1(int argc, const char **argv)
 +{
 +      int diag, nextarg = 1;
 +      kdb_machreg_t addr;
 +      unsigned long offset;
 +
 +      bb_cleanup();   /* in case previous command was interrupted */
 +      kdba_id_init(&kdb_di);
 +      if (argc != 1)
 +              return KDB_ARGCOUNT;
 +      diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
 +      if (diag)
 +              return diag;
 +      if (!addr)
 +              return KDB_BADADDR;
 +      kdb_save_flags();
 +      kdb_flags |= KDB_DEBUG_FLAG_BB << KDB_DEBUG_FLAG_SHIFT;
 +      kdb_bb(addr);
 +      bb_cleanup();
 +      kdb_restore_flags();
 +      kdbnearsym_cleanup();
 +      return 0;
 +}
 +
 +/* Run a basic block analysis on every function in the base kernel.  Used as a
 + * global sanity check to find errors in the basic block code.
 + */
 +
 +static int
 +kdb_bb_all(int argc, const char **argv)
 +{
 +      loff_t pos = 0;
 +      const char *symname;
 +      unsigned long addr;
 +      int i, max_errors = 20;
 +      struct bb_name_state *r;
 +      kdb_printf("%s: build variables:"
 +                 " CCVERSION \"" __stringify(CCVERSION) "\""
 +#ifdef        CONFIG_X86_64
 +                 " CONFIG_X86_64"
 +#endif
 +#ifdef        CONFIG_4KSTACKS
 +                 " CONFIG_4KSTACKS"
 +#endif
 +#ifdef        CONFIG_PREEMPT
 +                 " CONFIG_PREEMPT"
 +#endif
 +#ifdef        CONFIG_VM86
 +                 " CONFIG_VM86"
 +#endif
 +#ifdef        CONFIG_FRAME_POINTER
 +                 " CONFIG_FRAME_POINTER"
 +#endif
 +#ifdef        CONFIG_TRACE_IRQFLAGS
 +                 " CONFIG_TRACE_IRQFLAGS"
 +#endif
 +#ifdef        CONFIG_HIBERNATION
 +                 " CONFIG_HIBERNATION"
 +#endif
 +#ifdef        CONFIG_KPROBES
 +                 " CONFIG_KPROBES"
 +#endif
 +#ifdef        CONFIG_KEXEC
 +                 " CONFIG_KEXEC"
 +#endif
 +#ifdef        CONFIG_MATH_EMULATION
 +                 " CONFIG_MATH_EMULATION"
 +#endif
- #ifdef        CONFIG_PARAVIRT_XEN
++#ifdef        CONFIG_XEN
 +                 " CONFIG_XEN"
 +#endif
 +#ifdef        CONFIG_DEBUG_INFO
 +                 " CONFIG_DEBUG_INFO"
 +#endif
 +#ifdef        NO_SIBLINGS
 +                 " NO_SIBLINGS"
 +#endif
 +                 " REGPARM=" __stringify(REGPARM)
 +                 "\n\n", __FUNCTION__);
 +      for (i = 0, r = bb_special_cases;
 +           i < ARRAY_SIZE(bb_special_cases);
 +           ++i, ++r) {
 +              if (!r->address)
 +                      kdb_printf("%s: cannot find special_case name %s\n",
 +                                 __FUNCTION__, r->name);
 +      }
 +      for (i = 0; i < ARRAY_SIZE(bb_spurious); ++i) {
 +              if (!kallsyms_lookup_name(bb_spurious[i]))
 +                      kdb_printf("%s: cannot find spurious label %s\n",
 +                                 __FUNCTION__, bb_spurious[i]);
 +      }
 +      while ((symname = kdb_walk_kallsyms(&pos))) {
 +              if (strcmp(symname, "_stext") == 0 ||
 +                  strcmp(symname, "stext") == 0)
 +                      break;
 +      }
 +      if (!symname) {
 +              kdb_printf("%s: cannot find _stext\n", __FUNCTION__);
 +              return 0;
 +      }
 +      kdba_id_init(&kdb_di);
 +      i = 0;
 +      while ((symname = kdb_walk_kallsyms(&pos))) {
 +              if (strcmp(symname, "_etext") == 0)
 +                      break;
 +              if (i++ % 100 == 0)
 +                      kdb_printf(".");
 +              /* x86_64 has some 16 bit functions that appear between stext
 +               * and _etext.  Skip them.
 +               */
 +              if (strcmp(symname, "verify_cpu") == 0 ||
 +                  strcmp(symname, "verify_cpu_noamd") == 0 ||
 +                  strcmp(symname, "verify_cpu_sse_test") == 0 ||
 +                  strcmp(symname, "verify_cpu_no_longmode") == 0 ||
 +                  strcmp(symname, "verify_cpu_sse_ok") == 0 ||
 +                  strcmp(symname, "mode_seta") == 0 ||
 +                  strcmp(symname, "bad_address") == 0 ||
 +                  strcmp(symname, "wakeup_code") == 0 ||
 +                  strcmp(symname, "wakeup_code_start") == 0 ||
 +                  strcmp(symname, "wakeup_start") == 0 ||
 +                  strcmp(symname, "wakeup_32_vector") == 0 ||
 +                  strcmp(symname, "wakeup_32") == 0 ||
 +                  strcmp(symname, "wakeup_long64_vector") == 0 ||
 +                  strcmp(symname, "wakeup_long64") == 0 ||
 +                  strcmp(symname, "gdta") == 0 ||
 +                  strcmp(symname, "idt_48a") == 0 ||
 +                  strcmp(symname, "gdt_48a") == 0 ||
 +                  strcmp(symname, "bogus_real_magic") == 0 ||
 +                  strcmp(symname, "bogus_64_magic") == 0 ||
 +                  strcmp(symname, "no_longmode") == 0 ||
 +                  strcmp(symname, "mode_set") == 0 ||
 +                  strcmp(symname, "setbada") == 0 ||
 +                  strcmp(symname, "check_vesa") == 0 ||
 +                  strcmp(symname, "check_vesaa") == 0 ||
 +                  strcmp(symname, "_setbada") == 0 ||
 +                  strcmp(symname, "wakeup_stack_begin") == 0 ||
 +                  strcmp(symname, "wakeup_stack") == 0 ||
 +                  strcmp(symname, "wakeup_level4_pgt") == 0 ||
 +                  strcmp(symname, "acpi_copy_wakeup_routine") == 0 ||
 +                  strcmp(symname, "wakeup_end") == 0 ||
 +                  strcmp(symname, "do_suspend_lowlevel_s4bios") == 0 ||
 +                  strcmp(symname, "do_suspend_lowlevel") == 0 ||
 +                  strcmp(symname, "wakeup_pmode_return") == 0 ||
 +                  strcmp(symname, "restore_registers") == 0)
 +                      continue;
 +              /* __kprobes_text_end contains branches to the middle of code,
 +               * with undefined states.
 +               */
 +              if (strcmp(symname, "__kprobes_text_end") == 0)
 +                      continue;
 +              /* Data in the middle of the text segment :( */
 +              if (strcmp(symname, "level2_kernel_pgt") == 0 ||
 +                  strcmp(symname, "level3_kernel_pgt") == 0)
 +                      continue;
 +              if (bb_spurious_global_label(symname))
 +                      continue;
 +              if ((addr = kallsyms_lookup_name(symname)) == 0)
 +                      continue;
 +              // kdb_printf("BB " kdb_bfd_vma_fmt0 " %s\n", addr, symname);
 +              bb_cleanup();   /* in case previous command was interrupted */
 +              kdbnearsym_cleanup();
 +              kdb_bb(addr);
 +              touch_nmi_watchdog();
 +              if (bb_giveup) {
 +                      if (max_errors-- == 0) {
 +                              kdb_printf("%s: max_errors reached, giving up\n",
 +                                         __FUNCTION__);
 +                              break;
 +                      } else {
 +                              bb_giveup = 0;
 +                      }
 +              }
 +      }
 +      kdb_printf("\n");
 +      bb_cleanup();
 +      kdbnearsym_cleanup();
 +      return 0;
 +}
 +
 +/*
 + *=============================================================================
 + *
 + * Everything above this line is doing basic block analysis, function by
 + * function.  Everything below this line uses the basic block data to do a
 + * complete backtrace over all functions that are used by a process.
 + *
 + *=============================================================================
 + */
 +
 +
 +/*============================================================================*/
 +/*                                                                            */
 +/* Most of the backtrace code and data is common to x86_64 and i386.  This    */
 +/* large ifdef contains all of the differences between the two architectures. */
 +/*                                                                            */
 +/* Make sure you update the correct section of this ifdef.                    */
 +/*                                                                            */
 +/*============================================================================*/
 +#define XCS "cs"
 +#define RSP "sp"
 +#define RIP "ip"
 +#define ARCH_RSP sp
 +#define ARCH_RIP ip
 +
 +#ifdef        CONFIG_X86_64
 +
 +#define ARCH_NORMAL_PADDING (16 * 8)
 +
 +/* x86_64 has multiple alternate stacks, with different sizes and different
 + * offsets to get the link from one stack to the next.  All of the stacks are
 + * in the per_cpu area: either in the orig_ist or irq_stack_ptr. Debug events
  + * can even have multiple nested stacks within the single physical stack;
 + * each nested stack has its own link and some of those links are wrong.
 + *
 + * Consistent it's not!
 + *
 + * Do not assume that these stacks are aligned on their size.
 + */
 +#define INTERRUPT_STACK (N_EXCEPTION_STACKS + 1)
 +void
 +kdba_get_stack_info_alternate(kdb_machreg_t addr, int cpu,
 +                            struct kdb_activation_record *ar)
 +{
 +      static struct {
 +              const char *id;
 +              unsigned int total_size;
 +              unsigned int nested_size;
 +              unsigned int next;
 +      } *sdp, stack_data[] = {
 +              [STACKFAULT_STACK - 1] =  { "stackfault",    EXCEPTION_STKSZ, EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
 +              [DOUBLEFAULT_STACK - 1] = { "doublefault",   EXCEPTION_STKSZ, EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
 +              [NMI_STACK - 1] =         { "nmi",           EXCEPTION_STKSZ, EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
 +              [DEBUG_STACK - 1] =       { "debug",         DEBUG_STKSZ,     EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
 +              [MCE_STACK - 1] =         { "machine check", EXCEPTION_STKSZ, EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
 +              [INTERRUPT_STACK - 1] =   { "interrupt",     IRQ_STACK_SIZE,  IRQ_STACK_SIZE,  IRQ_STACK_SIZE  -   sizeof(void *) },
 +      };
 +      unsigned long total_start = 0, total_size, total_end;
 +      int sd, found = 0;
 +      extern unsigned long kdba_orig_ist(int, int);
 +
 +      for (sd = 0, sdp = stack_data;
 +           sd < ARRAY_SIZE(stack_data);
 +           ++sd, ++sdp) {
 +              total_size = sdp->total_size;
 +              if (!total_size)
 +                      continue;       /* in case stack_data[] has any holes */
 +              if (cpu < 0) {
 +                      /* Arbitrary address which can be on any cpu, see if it
 +                       * falls within any of the alternate stacks
 +                       */
 +                      int c;
 +                      for_each_online_cpu(c) {
 +                              if (sd == INTERRUPT_STACK - 1)
 +                                      total_end = (unsigned long)per_cpu(irq_stack_ptr, c);
 +                              else
 +                                      total_end = per_cpu(orig_ist, c).ist[sd];
 +                              total_start = total_end - total_size;
 +                              if (addr >= total_start && addr < total_end) {
 +                                      found = 1;
 +                                      cpu = c;
 +                                      break;
 +                              }
 +                      }
 +                      if (!found)
 +                              continue;
 +              }
 +              /* Only check the supplied or found cpu */
 +              if (sd == INTERRUPT_STACK - 1)
 +                      total_end = (unsigned long)per_cpu(irq_stack_ptr, cpu);
 +              else
 +                      total_end = per_cpu(orig_ist, cpu).ist[sd];
 +              total_start = total_end - total_size;
 +              if (addr >= total_start && addr < total_end) {
 +                      found = 1;
 +                      break;
 +              }
 +      }
 +      if (!found)
 +              return;
 +      /* find which nested stack the address is in */
 +      while (addr > total_start + sdp->nested_size)
 +              total_start += sdp->nested_size;
 +      ar->stack.physical_start = total_start;
 +      ar->stack.physical_end = total_start + sdp->nested_size;
 +      ar->stack.logical_start = total_start;
 +      ar->stack.logical_end = total_start + sdp->next;
 +      ar->stack.next = *(unsigned long *)ar->stack.logical_end;
 +      ar->stack.id = sdp->id;
 +
 +      /* Nasty: when switching to the interrupt stack, the stack state of the
 +       * caller is split over two stacks, the original stack and the
 +       * interrupt stack.  One word (the previous frame pointer) is stored on
 +       * the interrupt stack, the rest of the interrupt data is in the old
 +       * frame.  To make the interrupted stack state look as though it is
 +       * contiguous, copy the missing word from the interrupt stack to the
 +       * original stack and adjust the new stack pointer accordingly.
 +       */
 +
 +      if (sd == INTERRUPT_STACK - 1) {
 +              *(unsigned long *)(ar->stack.next - KDB_WORD_SIZE) =
 +                      ar->stack.next;
 +              ar->stack.next -= KDB_WORD_SIZE;
 +      }
 +}
 +
 +/* rip is not in the thread struct for x86_64.  We know that the stack value
 + * was saved in schedule near the label thread_return.  Setting rip to
 + * thread_return lets the stack trace find that we are in schedule and
 + * correctly decode its prologue.
 + */
 +
 +static kdb_machreg_t
 +kdba_bt_stack_rip(const struct task_struct *p)
 +{
 +      return bb_thread_return;
 +}
 +
 +#else /* !CONFIG_X86_64 */
 +
 +#define ARCH_NORMAL_PADDING (19 * 4)
 +
 +#ifdef        CONFIG_4KSTACKS
 +static struct thread_info **kdba_hardirq_ctx, **kdba_softirq_ctx;
 +#endif        /* CONFIG_4KSTACKS */
 +
 +/* On a 4K stack kernel, hardirq_ctx and softirq_ctx are [NR_CPUS] arrays.  The
 + * first element of each per-cpu stack is a struct thread_info.
 + */
 +void
 +kdba_get_stack_info_alternate(kdb_machreg_t addr, int cpu,
 +                            struct kdb_activation_record *ar)
 +{
 +#ifdef        CONFIG_4KSTACKS
 +      struct thread_info *tinfo;
 +      tinfo = (struct thread_info *)(addr & -THREAD_SIZE);
 +      if (cpu < 0) {
 +              /* Arbitrary address, see if it falls within any of the irq
 +               * stacks
 +               */
 +              int found = 0;
 +              for_each_online_cpu(cpu) {
 +                      if (tinfo == kdba_hardirq_ctx[cpu] ||
 +                          tinfo == kdba_softirq_ctx[cpu]) {
 +                              found = 1;
 +                              break;
 +                      }
 +              }
 +              if (!found)
 +                      return;
 +      }
 +      if (tinfo == kdba_hardirq_ctx[cpu] ||
 +          tinfo == kdba_softirq_ctx[cpu]) {
 +              ar->stack.physical_start = (kdb_machreg_t)tinfo;
 +              ar->stack.physical_end = ar->stack.physical_start + THREAD_SIZE;
 +              ar->stack.logical_start = ar->stack.physical_start +
 +                                        sizeof(struct thread_info);
 +              ar->stack.logical_end = ar->stack.physical_end;
 +              ar->stack.next = tinfo->previous_esp;
 +              if (tinfo == kdba_hardirq_ctx[cpu])
 +                      ar->stack.id = "hardirq_ctx";
 +              else
 +                      ar->stack.id = "softirq_ctx";
 +      }
 +#endif        /* CONFIG_4KSTACKS */
 +}
 +
 +/* rip is in the thread struct for i386 */
 +
 +static kdb_machreg_t
 +kdba_bt_stack_rip(const struct task_struct *p)
 +{
 +      return p->thread.ip;
 +}
 +
 +#endif        /* CONFIG_X86_64 */
 +
 +/* Given an address which claims to be on a stack, an optional cpu number and
 + * an optional task address, get information about the stack.
 + *
 + * t == NULL, cpu < 0 indicates an arbitrary stack address with no associated
 + * struct task, the address can be in an alternate stack or any task's normal
 + * stack.
 + *
 + * t != NULL, cpu >= 0 indicates a running task, the address can be in an
 + * alternate stack or that task's normal stack.
 + *
 + * t != NULL, cpu < 0 indicates a blocked task, the address can only be in that
 + * task's normal stack.
 + *
 + * t == NULL, cpu >= 0 is not a valid combination.
 + */
 +
 +static void
 +kdba_get_stack_info(kdb_machreg_t rsp, int cpu,
 +                  struct kdb_activation_record *ar,
 +                  const struct task_struct *t)
 +{
 +      struct thread_info *tinfo;
 +      struct task_struct *g, *p;
 +      memset(&ar->stack, 0, sizeof(ar->stack));
 +      if (KDB_DEBUG(ARA))
 +              kdb_printf("%s: " RSP "=0x%lx cpu=%d task=%p\n",
 +                         __FUNCTION__, rsp, cpu, t);
 +      if (t == NULL || cpu >= 0) {
 +              kdba_get_stack_info_alternate(rsp, cpu, ar);
 +              if (ar->stack.logical_start)
 +                      goto out;
 +      }
 +      rsp &= -THREAD_SIZE;
 +      tinfo = (struct thread_info *)rsp;
 +      if (t == NULL) {
 +              /* Arbitrary stack address without an associated task, see if
 +               * it falls within any normal process stack, including the idle
 +               * tasks.
 +               */
 +              kdb_do_each_thread(g, p) {
 +                      if (tinfo == task_thread_info(p)) {
 +                              t = p;
 +                              goto found;
 +                      }
 +              } kdb_while_each_thread(g, p);
 +              for_each_online_cpu(cpu) {
 +                      p = idle_task(cpu);
 +                      if (tinfo == task_thread_info(p)) {
 +                              t = p;
 +                              goto found;
 +                      }
 +              }
 +      found:
 +              if (KDB_DEBUG(ARA))
 +                      kdb_printf("%s: found task %p\n", __FUNCTION__, t);
 +      } else if (cpu >= 0) {
 +              /* running task */
 +              struct kdb_running_process *krp = kdb_running_process + cpu;
 +              if (krp->p != t || tinfo != task_thread_info(t))
 +                      t = NULL;
 +              if (KDB_DEBUG(ARA))
 +                      kdb_printf("%s: running task %p\n", __FUNCTION__, t);
 +      } else {
 +              /* blocked task */
 +              if (tinfo != task_thread_info(t))
 +                      t = NULL;
 +              if (KDB_DEBUG(ARA))
 +                      kdb_printf("%s: blocked task %p\n", __FUNCTION__, t);
 +      }
 +      if (t) {
 +              ar->stack.physical_start = rsp;
 +              ar->stack.physical_end = rsp + THREAD_SIZE;
 +              ar->stack.logical_start = rsp + sizeof(struct thread_info);
 +              ar->stack.logical_end = ar->stack.physical_end - ARCH_NORMAL_PADDING;
 +              ar->stack.next = 0;
 +              ar->stack.id = "normal";
 +      }
 +out:
 +      if (ar->stack.physical_start && KDB_DEBUG(ARA)) {
 +              kdb_printf("%s: ar->stack\n", __FUNCTION__);
 +              kdb_printf("    physical_start=0x%lx\n", ar->stack.physical_start);
 +              kdb_printf("    physical_end=0x%lx\n", ar->stack.physical_end);
 +              kdb_printf("    logical_start=0x%lx\n", ar->stack.logical_start);
 +              kdb_printf("    logical_end=0x%lx\n", ar->stack.logical_end);
 +              kdb_printf("    next=0x%lx\n", ar->stack.next);
 +              kdb_printf("    id=%s\n", ar->stack.id);
 +              kdb_printf("    set MDCOUNT %ld\n",
 +                         (ar->stack.physical_end - ar->stack.physical_start) /
 +                         KDB_WORD_SIZE);
 +              kdb_printf("    mds " kdb_machreg_fmt0 "\n",
 +                         ar->stack.physical_start);
 +      }
 +}
 +
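 +/* Print one backtrace line: the stack pointer, the symbolic rip and, when
 + * argument decoding is available, up to argcount recovered argument values
 + * (invalid ones are printed as "invalid").  The NOSECT and BTSYMARG
 + * environment variables control the extra section and per-argument symbol
 + * output.
 + */
 +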
 +static void
 +bt_print_one(kdb_machreg_t rip, kdb_machreg_t rsp,
 +            const struct kdb_activation_record *ar,
 +            const kdb_symtab_t *symtab, int argcount)
 +{
 +      int btsymarg = 0;
 +      int nosect = 0;
 +
 +      kdbgetintenv("BTSYMARG", &btsymarg);
 +      kdbgetintenv("NOSECT", &nosect);
 +
 +      kdb_printf(kdb_machreg_fmt0, rsp);
 +      kdb_symbol_print(rip, symtab,
 +                       KDB_SP_SPACEB|KDB_SP_VALUE);
 +      if (argcount && ar->args) {
 +              int i, argc = ar->args;
 +              kdb_printf(" (");
 +              if (argc > argcount)
 +                      argc = argcount;
 +              for (i = 0; i < argc; i++) {
 +                      if (i)
 +                              kdb_printf(", ");
 +                      if (test_bit(i, ar->valid.bits))
 +                              kdb_printf("0x%lx", ar->arg[i]);
 +                      else
 +                              kdb_printf("invalid");
 +              }
 +              kdb_printf(")");
 +      }
 +      kdb_printf("\n");
 +      if (symtab->sym_name) {
 +              if (!nosect) {
 +                      kdb_printf("                               %s",
 +                                 symtab->mod_name);
 +                      if (symtab->sec_name && symtab->sec_start)
 +                              kdb_printf(" 0x%lx 0x%lx",
 +                                         symtab->sec_start, symtab->sec_end);
 +                      kdb_printf(" 0x%lx 0x%lx\n",
 +                                 symtab->sym_start, symtab->sym_end);
 +              }
 +      }
 +      if (argcount && ar->args && btsymarg) {
 +              int i, argc = ar->args;
 +              kdb_symtab_t arg_symtab;
 +              for (i = 0; i < argc; i++) {
 +                      kdb_machreg_t arg = ar->arg[i];
 +                      if (test_bit(i, ar->valid.bits) &&
 +                          kdbnearsym(arg, &arg_symtab)) {
 +                              kdb_printf("                       ARG %2d ", i);
 +                              kdb_symbol_print(arg, &arg_symtab,
 +                                               KDB_SP_DEFAULT|KDB_SP_NEWLINE);
 +                      }
 +              }
 +      }
 +}
 +
 +static void
 +kdba_bt_new_stack(struct kdb_activation_record *ar, kdb_machreg_t *rsp,
 +                 int *count, int *suppress)
 +{
 +      /* Nasty: save_args builds a partial pt_regs, with r15 through
 +       * rbx not being filled in.  It passes struct pt_regs* to do_IRQ (in
 +       * rdi) but the stack pointer is not adjusted to account for r15
 +       * through rbx.  This has two effects :-
 +       *
 +       * (1) struct pt_regs on an external interrupt actually overlaps with
 +       *     the local stack area used by do_IRQ.  Not only are r15-rbx
 +       *     undefined, the area that claims to hold their values can even
 +       *     change as the irq is processed.
 +       *
 +       * (2) The back stack pointer saved for the new frame is not pointing
 +       *     at pt_regs, it is pointing at rbx within the pt_regs passed to
 +       *     do_IRQ.
 +       *
 +       * There is nothing that I can do about (1) but I have to fix (2)
 +       * because kdb backtrace looks for the "start" address of pt_regs as it
 +       * walks back through the stacks.  When switching from the interrupt
 +       * stack to another stack, we have to assume that pt_regs has been
 +       * seen and turn off backtrace suppression.
 +       */
 +      int probable_pt_regs = strcmp(ar->stack.id, "interrupt") == 0;
 +      *rsp = ar->stack.next;
 +      if (KDB_DEBUG(ARA))
 +              kdb_printf("new " RSP "=" kdb_machreg_fmt0 "\n", *rsp);
 +      bb_actual_set_value(BBRG_RSP, *rsp);
 +      kdba_get_stack_info(*rsp, -1, ar, NULL);
 +      if (!ar->stack.physical_start) {
 +              kdb_printf("+++ Cannot resolve next stack\n");
 +      } else if (!*suppress) {
 +              kdb_printf(" ======================= <%s>\n",
 +                         ar->stack.id);
 +              ++*count;
 +      }
 +      if (probable_pt_regs)
 +              *suppress = 0;
 +}
 +
 +/*
 + * kdba_bt_stack
 + *
 + * Inputs:
 + *    addr    Address provided to 'bt' command, if any.
 + *    argcount
 + *    p       Pointer to task for 'btp' command.
 + * Outputs:
 + *    None.
 + * Returns:
 + *    zero for success, a kdb diagnostic if error
 + * Locking:
 + *    none.
 + * Remarks:
 + *    Ultimately all the bt* commands come through this routine.  If
 + *    old_style is 0 then it uses the basic block analysis to get an accurate
 + *    backtrace with arguments, otherwise it falls back to the old method of
 + *    printing anything on stack that looks like a kernel address.
 + *
 + *    Allowing for the stack data pushed by the hardware is tricky.  We
 + *    deduce the presence of hardware pushed data by looking for interrupt
 + *    handlers, either by name or by the code that they contain.  This
 + *    information must be applied to the next function up the stack, because
 + *    the hardware data is above the saved rip for the interrupted (next)
 + *    function.
 + *
 + *    To make things worse, the amount of data pushed is arch specific and
 + *    may depend on the rsp for the next function, not the current function.
 + *    The number of bytes pushed by hardware cannot be calculated until we
 + *    are actually processing the stack for the interrupted function and have
 + *    its rsp.
 + *
 + *    It is also possible for an interrupt to occur in user space and for the
 + *    interrupt handler to also be interrupted.  Check the code selector
 + *    whenever the previous function is an interrupt handler and stop
 + *    backtracing if the interrupt was not in kernel space.
 + */
 +
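 +/* Typical use (illustrative, from the kdb prompt): 'bt' backtraces the
 + * current process, 'btp <pid>' a chosen process and 'bt <address>' starts
 + * from an explicit stack address; all of them end up here.  'set BTSP 1'
 + * disables the suppression of frames above pt_regs, as suggested by the
 + * message at the end of this function.
 + */
 +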
 +static int
 +kdba_bt_stack(kdb_machreg_t addr, int argcount, const struct task_struct *p,
 +             int old_style)
 +{
 +      struct kdb_activation_record ar;
 +      kdb_machreg_t rip = 0, rsp = 0, prev_rsp, cs;
 +      kdb_symtab_t symtab;
 +      int rip_at_rsp = 0, count = 0, btsp = 0, suppress,
 +          interrupt_handler = 0, prev_interrupt_handler = 0, hardware_pushed,
 +          prev_noret = 0;
 +      struct pt_regs *regs = NULL;
 +
 +      kdbgetintenv("BTSP", &btsp);
 +      suppress = !btsp;
 +      memset(&ar, 0, sizeof(ar));
 +      if (old_style)
 +              kdb_printf("Using old style backtrace, unreliable with no arguments\n");
 +
 +      /*
 +       * The caller may have supplied an address at which the stack traceback
 +       * operation should begin.  This address is assumed by this code to
 +       * point to a return address on the stack to be traced back.
 +       *
 +       * Warning: type in the wrong address and you will get garbage in the
 +       * backtrace.
 +       */
 +      if (addr) {
 +              rsp = addr;
 +              kdb_getword(&rip, rsp, sizeof(rip));
 +              rip_at_rsp = 1;
 +              suppress = 0;
 +              kdba_get_stack_info(rsp, -1, &ar, NULL);
 +      } else {
 +              if (task_curr(p)) {
 +                      struct kdb_running_process *krp =
 +                          kdb_running_process + task_cpu(p);
 +                      kdb_machreg_t cs;
 +                      regs = krp->regs;
 +                      if (krp->seqno &&
 +                          krp->p == p &&
 +                          krp->seqno >= kdb_seqno - 1 &&
 +                          !KDB_NULL_REGS(regs)) {
 +                              /* valid saved state, continue processing */
 +                      } else {
 +                              kdb_printf
 +                                  ("Process did not save state, cannot backtrace\n");
 +                              kdb_ps1(p);
 +                              return 0;
 +                      }
 +                      kdba_getregcontents(XCS, regs, &cs);
 +                      if ((cs & 0xffff) != __KERNEL_CS) {
 +                              kdb_printf("Stack is not in kernel space, backtrace not available\n");
 +                              return 0;
 +                      }
 +                      rip = krp->arch.ARCH_RIP;
 +                      rsp = krp->arch.ARCH_RSP;
 +                      kdba_get_stack_info(rsp, kdb_process_cpu(p), &ar, p);
 +              } else {
 +                      /* Not on cpu, assume blocked.  Blocked tasks do not
 +                       * have pt_regs.  p->thread contains some data, alas
 +                       * what it contains differs between i386 and x86_64.
 +                       */
 +                      rip = kdba_bt_stack_rip(p);
 +                      rsp = p->thread.sp;
 +                      suppress = 0;
 +                      kdba_get_stack_info(rsp, -1, &ar, p);
 +              }
 +      }
 +      if (!ar.stack.physical_start) {
 +              kdb_printf(RSP "=0x%lx is not in a valid kernel stack, backtrace not available\n",
 +                         rsp);
 +              return 0;
 +      }
 +      memset(&bb_actual, 0, sizeof(bb_actual));
 +      bb_actual_set_value(BBRG_RSP, rsp);
 +      bb_actual_set_valid(BBRG_RSP, 1);
 +
 +      kdb_printf(RSP "%*s" RIP "%*sFunction (args)\n",
 +                 2*KDB_WORD_SIZE, " ",
 +                 2*KDB_WORD_SIZE, " ");
 +      if (ar.stack.next && !suppress)
 +              kdb_printf(" ======================= <%s>\n",
 +                         ar.stack.id);
 +
 +      bb_cleanup();
 +      /* Run through all the stacks */
 +      while (ar.stack.physical_start) {
 +              if (rip_at_rsp) {
 +                      rip = *(kdb_machreg_t *)rsp;
 +                      /* I wish that gcc was fixed to include a nop
 +                       * instruction after ATTRIB_NORET functions.  The lack
 +                       * of a nop means that the return address points to the
 +                       * start of next function, so fudge it to point to one
 +                       * byte previous.
 +                       *
 +                       * No, we cannot just decrement all rip values.
 +                       * Sometimes an rip legally points to the start of a
 +                       * function, e.g. interrupted code or hand crafted
 +                       * assembler.
 +                       */
 +                      if (prev_noret) {
 +                              kdbnearsym(rip, &symtab);
 +                              if (rip == symtab.sym_start) {
 +                                      --rip;
 +                                      if (KDB_DEBUG(ARA))
 +                                              kdb_printf("\tprev_noret, " RIP
 +                                                         "=0x%lx\n", rip);
 +                              }
 +                      }
 +              }
 +              kdbnearsym(rip, &symtab);
 +              if (old_style) {
 +                      if (__kernel_text_address(rip) && !suppress) {
 +                              bt_print_one(rip, rsp, &ar, &symtab, 0);
 +                              ++count;
 +                      }
 +                      if (rsp == (unsigned long)regs) {
 +                              if (ar.stack.next && suppress)
 +                                      kdb_printf(" ======================= <%s>\n",
 +                                                 ar.stack.id);
 +                              ++count;
 +                              suppress = 0;
 +                      }
 +                      rsp += sizeof(rip);
 +                      rip_at_rsp = 1;
 +                      if (rsp >= ar.stack.logical_end) {
 +                              if (!ar.stack.next)
 +                                      break;
 +                              kdba_bt_new_stack(&ar, &rsp, &count, &suppress);
 +                              rip_at_rsp = 0;
 +                              continue;
 +                      }
 +              } else {
 +                      /* Start each analysis with no dynamic data from the
 +                       * previous kdb_bb() run.
 +                       */
 +                      bb_cleanup();
 +                      kdb_bb(rip);
 +                      if (bb_giveup)
 +                              break;
 +                      prev_interrupt_handler = interrupt_handler;
 +                      interrupt_handler = bb_interrupt_handler(rip);
 +                      prev_rsp = rsp;
 +                      if (rip_at_rsp) {
 +                              if (prev_interrupt_handler) {
 +                                      cs = *((kdb_machreg_t *)rsp + 1) & 0xffff;
 +                                      hardware_pushed =
 +                                              bb_hardware_pushed_arch(rsp, &ar);
 +                              } else {
 +                                      cs = __KERNEL_CS;
 +                                      hardware_pushed = 0;
 +                              }
 +                              rsp += sizeof(rip) + hardware_pushed;
 +                              if (KDB_DEBUG(ARA))
 +                                      kdb_printf("%s: " RSP " "
 +                                                 kdb_machreg_fmt0
 +                                                 " -> " kdb_machreg_fmt0
 +                                                 " hardware_pushed %d"
 +                                                 " prev_interrupt_handler %d"
 +                                                 " cs 0x%lx\n",
 +                                                 __FUNCTION__,
 +                                                 prev_rsp,
 +                                                 rsp,
 +                                                 hardware_pushed,
 +                                                 prev_interrupt_handler,
 +                                                 cs);
 +                              if (rsp >= ar.stack.logical_end &&
 +                                  ar.stack.next) {
 +                                      kdba_bt_new_stack(&ar, &rsp, &count,
 +                                                         &suppress);
 +                                      rip_at_rsp = 0;
 +                                      continue;
 +                              }
 +                              bb_actual_set_value(BBRG_RSP, rsp);
 +                      } else {
 +                              cs = __KERNEL_CS;
 +                      }
 +                      rip_at_rsp = 1;
 +                      bb_actual_rollback(&ar);
 +                      if (bb_giveup)
 +                              break;
 +                      if (bb_actual_value(BBRG_RSP) < rsp) {
 +                              kdb_printf("%s: " RSP " is going backwards, "
 +                                         kdb_machreg_fmt0 " -> "
 +                                         kdb_machreg_fmt0 "\n",
 +                                         __FUNCTION__,
 +                                         rsp,
 +                                         bb_actual_value(BBRG_RSP));
 +                              bb_giveup = 1;
 +                              break;
 +                      }
 +                      bb_arguments(&ar);
 +                      if (!suppress) {
 +                              bt_print_one(rip, prev_rsp, &ar, &symtab, argcount);
 +                              ++count;
 +                      }
 +                      /* Functions that terminate the backtrace */
 +                      if (strcmp(bb_func_name, "cpu_idle") == 0 ||
 +                          strcmp(bb_func_name, "child_rip") == 0)
 +                              break;
 +                      if (rsp >= ar.stack.logical_end &&
 +                          !ar.stack.next)
 +                              break;
 +                      if (rsp <= (unsigned long)regs &&
 +                          bb_actual_value(BBRG_RSP) > (unsigned long)regs) {
 +                              if (ar.stack.next && suppress)
 +                                      kdb_printf(" ======================= <%s>\n",
 +                                                 ar.stack.id);
 +                              ++count;
 +                              suppress = 0;
 +                      }
 +                      if (cs != __KERNEL_CS) {
 +                              kdb_printf("Reached user space\n");
 +                              break;
 +                      }
 +                      rsp = bb_actual_value(BBRG_RSP);
 +              }
 +              prev_noret = bb_noret(bb_func_name);
 +              if (count > 200)
 +                      break;
 +      }
 +      if (bb_giveup)
 +              return 1;
 +      bb_cleanup();
 +      kdbnearsym_cleanup();
 +
 +      if (count > 200) {
 +              kdb_printf("bt truncated, count limit reached\n");
 +              return 1;
 +      } else if (suppress) {
 +              kdb_printf
 +                  ("bt did not find pt_regs - no trace produced.  Suggest 'set BTSP 1'\n");
 +              return 1;
 +      }
 +
 +      return 0;
 +}
 +
 +/*
 + * kdba_bt_address
 + *
 + *    Do a backtrace starting at a specified stack address.  Use this if the
 + *    heuristics get the stack decode wrong.
 + *
 + * Inputs:
 + *    addr    Address provided to 'bt' command.
 + *    argcount
 + * Outputs:
 + *    None.
 + * Returns:
 + *    zero for success, a kdb diagnostic if error
 + * Locking:
 + *    none.
 + * Remarks:
 + *    mds %rsp comes in handy when examining the stack to do a manual
 + *    traceback.
 + */
 +
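 +/* Illustrative recovery sequence: if 'bt' mis-decodes a stack, dump the raw
 + * stack words with 'mds %rsp', pick a value that looks like a saved return
 + * address and retry with 'bt <that-address>', which comes through here.
 + */
 +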
 +int kdba_bt_address(kdb_machreg_t addr, int argcount)
 +{
 +      int ret;
 +      kdba_id_init(&kdb_di);                  /* kdb_bb needs this done once */
 +      ret = kdba_bt_stack(addr, argcount, NULL, 0);
 +      if (ret == 1)
 +              ret = kdba_bt_stack(addr, argcount, NULL, 1);
 +      return ret;
 +}
 +
 +/*
 + * kdba_bt_process
 + *
 + *    Do a backtrace for a specified process.
 + *
 + * Inputs:
 + *    p       Struct task pointer extracted by 'bt' command.
 + *    argcount
 + * Outputs:
 + *    None.
 + * Returns:
 + *    zero for success, a kdb diagnostic if error
 + * Locking:
 + *    none.
 + */
 +
 +int kdba_bt_process(const struct task_struct *p, int argcount)
 +{
 +      int ret;
 +      kdba_id_init(&kdb_di);                  /* kdb_bb needs this done once */
 +      ret = kdba_bt_stack(0, argcount, p, 0);
 +      if (ret == 1)
 +              ret = kdba_bt_stack(0, argcount, p, 1);
 +      return ret;
 +}
 +
 +static int __init kdba_bt_x86_init(void)
 +{
 +      int i, c, cp = -1;
 +      struct bb_name_state *r;
 +
 +      kdb_register_repeat("bb1", kdb_bb1, "<vaddr>",  "Analyse one basic block", 0, KDB_REPEAT_NONE);
 +      kdb_register_repeat("bb_all", kdb_bb_all, "",   "Backtrace check on all built in functions", 0, KDB_REPEAT_NONE);
 +
 +      /* Split the opcode usage table by the first letter of each set of
 +       * opcodes, for faster mapping of opcode to its operand usage.
 +       */
 +      for (i = 0; i < ARRAY_SIZE(bb_opcode_usage_all); ++i) {
 +              c = bb_opcode_usage_all[i].opcode[0] - 'a';
 +              if (c != cp) {
 +                      cp = c;
 +                      bb_opcode_usage[c].opcode = bb_opcode_usage_all + i;
 +              }
 +              ++bb_opcode_usage[c].size;
 +      }
 +
 +      bb_common_interrupt = kallsyms_lookup_name("common_interrupt");
 +      bb_error_entry = kallsyms_lookup_name("error_entry");
 +      bb_ret_from_intr = kallsyms_lookup_name("ret_from_intr");
 +      bb_thread_return = kallsyms_lookup_name("thread_return");
 +      bb_sync_regs = kallsyms_lookup_name("sync_regs");
 +      bb_save_v86_state = kallsyms_lookup_name("save_v86_state");
 +      bb__sched_text_start = kallsyms_lookup_name("__sched_text_start");
 +      bb__sched_text_end = kallsyms_lookup_name("__sched_text_end");
 +      bb_save_args = kallsyms_lookup_name("save_args");
 +      bb_save_rest = kallsyms_lookup_name("save_rest");
 +      bb_save_paranoid = kallsyms_lookup_name("save_paranoid");
 +      for (i = 0, r = bb_special_cases;
 +           i < ARRAY_SIZE(bb_special_cases);
 +           ++i, ++r) {
 +              r->address = kallsyms_lookup_name(r->name);
 +      }
 +
 +#ifdef        CONFIG_4KSTACKS
 +      kdba_hardirq_ctx = (struct thread_info **)kallsyms_lookup_name("hardirq_ctx");
 +      kdba_softirq_ctx = (struct thread_info **)kallsyms_lookup_name("softirq_ctx");
 +#endif        /* CONFIG_4KSTACKS */
 +
 +      return 0;
 +}
 +
 +static void __exit kdba_bt_x86_exit(void)
 +{
 +      kdb_unregister("bb1");
 +      kdb_unregister("bb_all");
 +}
 +
 +module_init(kdba_bt_x86_init)
 +module_exit(kdba_bt_x86_exit)
index 417b7c6,0000000..4d1c88d
mode 100644,000000..100644
--- /dev/null
@@@ -1,1536 -1,0 +1,1536 @@@
 +/*
 + * Kernel Debugger Architecture Independent Support Functions
 + *
 + * This file is subject to the terms and conditions of the GNU General Public
 + * License.  See the file "COPYING" in the main directory of this archive
 + * for more details.
 + *
 + * Copyright (c) 1999-2008 Silicon Graphics, Inc.  All Rights Reserved.
 + */
 +
 +#include <linux/string.h>
 +#include <linux/stddef.h>
 +#include <linux/kernel.h>
 +#include <linux/module.h>
 +#include <linux/init.h>
 +#include <linux/irq.h>
 +#include <linux/ptrace.h>
 +#include <linux/mm.h>
 +#include <linux/sched.h>
 +#include <linux/hardirq.h>
 +#include <linux/kdb.h>
 +#include <linux/kdbprivate.h>
 +#include <linux/interrupt.h>
 +#include <linux/kdebug.h>
 +#include <linux/cpumask.h>
 +
 +#include <asm/processor.h>
 +#include <asm/msr.h>
 +#include <asm/uaccess.h>
 +#include <asm/desc.h>
 +
 +static kdb_machreg_t
 +kdba_getcr(int regnum)
 +{
 +      kdb_machreg_t contents = 0;
 +      switch(regnum) {
 +      case 0:
 +              __asm__ (_ASM_MOV " %%cr0,%0\n\t":"=r"(contents));
 +              break;
 +      case 1:
 +              break;
 +      case 2:
 +              __asm__ (_ASM_MOV " %%cr2,%0\n\t":"=r"(contents));
 +              break;
 +      case 3:
 +              __asm__ (_ASM_MOV " %%cr3,%0\n\t":"=r"(contents));
 +              break;
 +      case 4:
 +              __asm__ (_ASM_MOV " %%cr4,%0\n\t":"=r"(contents));
 +              break;
 +      default:
 +              break;
 +      }
 +
 +      return contents;
 +}
 +
 +void
 +kdba_putdr(int regnum, kdb_machreg_t contents)
 +{
 +      switch(regnum) {
 +      case 0:
 +              __asm__ (_ASM_MOV " %0,%%db0\n\t"::"r"(contents));
 +              break;
 +      case 1:
 +              __asm__ (_ASM_MOV " %0,%%db1\n\t"::"r"(contents));
 +              break;
 +      case 2:
 +              __asm__ (_ASM_MOV " %0,%%db2\n\t"::"r"(contents));
 +              break;
 +      case 3:
 +              __asm__ (_ASM_MOV " %0,%%db3\n\t"::"r"(contents));
 +              break;
 +      case 4:
 +      case 5:
 +              break;
 +      case 6:
 +              __asm__ (_ASM_MOV " %0,%%db6\n\t"::"r"(contents));
 +              break;
 +      case 7:
 +              __asm__ (_ASM_MOV " %0,%%db7\n\t"::"r"(contents));
 +              break;
 +      default:
 +              break;
 +      }
 +}
 +
 +kdb_machreg_t
 +kdba_getdr(int regnum)
 +{
 +      kdb_machreg_t contents = 0;
 +      switch(regnum) {
 +      case 0:
 +              __asm__ (_ASM_MOV " %%db0,%0\n\t":"=r"(contents));
 +              break;
 +      case 1:
 +              __asm__ (_ASM_MOV " %%db1,%0\n\t":"=r"(contents));
 +              break;
 +      case 2:
 +              __asm__ (_ASM_MOV " %%db2,%0\n\t":"=r"(contents));
 +              break;
 +      case 3:
 +              __asm__ (_ASM_MOV " %%db3,%0\n\t":"=r"(contents));
 +              break;
 +      case 4:
 +      case 5:
 +              break;
 +      case 6:
 +              __asm__ (_ASM_MOV " %%db6,%0\n\t":"=r"(contents));
 +              break;
 +      case 7:
 +              __asm__ (_ASM_MOV " %%db7,%0\n\t":"=r"(contents));
 +              break;
 +      default:
 +              break;
 +      }
 +
 +      return contents;
 +}
 +
 +kdb_machreg_t
 +kdba_getdr6(void)
 +{
 +      return kdba_getdr(6);
 +}
 +
 +kdb_machreg_t
 +kdba_getdr7(void)
 +{
 +      return kdba_getdr(7);
 +}
 +
 +void
 +kdba_putdr6(kdb_machreg_t contents)
 +{
 +      kdba_putdr(6, contents);
 +}
 +
 +static void
 +kdba_putdr7(kdb_machreg_t contents)
 +{
 +      kdba_putdr(7, contents);
 +}
 +
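 +/* Program the hardware breakpoint described by bp into this cpu's debug
 + * registers: load the breakpoint address into the DR slot assigned on this
 + * cpu and set the matching RW/LEN and global-enable bits in DR7.
 + */
 +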
 +void
 +kdba_installdbreg(kdb_bp_t *bp)
 +{
 +      int cpu = smp_processor_id();
 +
 +      kdb_machreg_t dr7;
 +
 +      dr7 = kdba_getdr7();
 +
 +      kdba_putdr(bp->bp_hard[cpu]->bph_reg, bp->bp_addr);
 +
 +      dr7 |= DR7_GE;
 +      if (cpu_has_de)
 +              set_in_cr4(X86_CR4_DE);
 +
 +      switch (bp->bp_hard[cpu]->bph_reg){
 +      case 0:
 +              DR7_RW0SET(dr7,bp->bp_hard[cpu]->bph_mode);
 +              DR7_LEN0SET(dr7,bp->bp_hard[cpu]->bph_length);
 +              DR7_G0SET(dr7);
 +              break;
 +      case 1:
 +              DR7_RW1SET(dr7,bp->bp_hard[cpu]->bph_mode);
 +              DR7_LEN1SET(dr7,bp->bp_hard[cpu]->bph_length);
 +              DR7_G1SET(dr7);
 +              break;
 +      case 2:
 +              DR7_RW2SET(dr7,bp->bp_hard[cpu]->bph_mode);
 +              DR7_LEN2SET(dr7,bp->bp_hard[cpu]->bph_length);
 +              DR7_G2SET(dr7);
 +              break;
 +      case 3:
 +              DR7_RW3SET(dr7,bp->bp_hard[cpu]->bph_mode);
 +              DR7_LEN3SET(dr7,bp->bp_hard[cpu]->bph_length);
 +              DR7_G3SET(dr7);
 +              break;
 +      default:
 +              kdb_printf("kdb: Bad debug register!! %ld\n",
 +                         bp->bp_hard[cpu]->bph_reg);
 +              break;
 +      }
 +
 +      kdba_putdr7(dr7);
 +      return;
 +}
 +
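 +/* Undo kdba_installdbreg(): clear the DR slot used by bp on this cpu and
 + * drop its local/global enable bits from DR7.
 + */
 +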
 +void
 +kdba_removedbreg(kdb_bp_t *bp)
 +{
 +      int regnum;
 +      kdb_machreg_t dr7;
 +      int cpu = smp_processor_id();
 +
 +      if (!bp->bp_hard[cpu])
 +              return;
 +
 +      regnum = bp->bp_hard[cpu]->bph_reg;
 +
 +      dr7 = kdba_getdr7();
 +
 +      kdba_putdr(regnum, 0);
 +
 +      switch (regnum) {
 +      case 0:
 +              DR7_G0CLR(dr7);
 +              DR7_L0CLR(dr7);
 +              break;
 +      case 1:
 +              DR7_G1CLR(dr7);
 +              DR7_L1CLR(dr7);
 +              break;
 +      case 2:
 +              DR7_G2CLR(dr7);
 +              DR7_L2CLR(dr7);
 +              break;
 +      case 3:
 +              DR7_G3CLR(dr7);
 +              DR7_L3CLR(dr7);
 +              break;
 +      default:
 +              kdb_printf("kdb: Bad debug register!! %d\n", regnum);
 +              break;
 +      }
 +
 +      kdba_putdr7(dr7);
 +}
 +
 +struct kdbregs {
 +      char   *reg_name;
 +      size_t  reg_offset;
 +};
 +
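 +/* Note: in dbreglist the reg_offset field holds the debug register number
 + * (0-3, 6, 7), which is passed straight to kdba_getdr()/kdba_putdr(); it is
 + * only a pt_regs byte offset in kdbreglist below.
 + */
 +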
 +static struct kdbregs dbreglist[] = {
 +      { "dr0",        0 },
 +      { "dr1",        1 },
 +      { "dr2",        2 },
 +      { "dr3",        3 },
 +      { "dr6",        6 },
 +      { "dr7",        7 },
 +};
 +
 +static const int ndbreglist = sizeof(dbreglist) / sizeof(struct kdbregs);
 +
 +#ifdef CONFIG_X86_32
 +static struct kdbregs kdbreglist[] = {
 +      { "ax",         offsetof(struct pt_regs, ax) },
 +      { "bx",         offsetof(struct pt_regs, bx) },
 +      { "cx",         offsetof(struct pt_regs, cx) },
 +      { "dx",         offsetof(struct pt_regs, dx) },
 +
 +      { "si",         offsetof(struct pt_regs, si) },
 +      { "di",         offsetof(struct pt_regs, di) },
 +      { "sp",         offsetof(struct pt_regs, sp) },
 +      { "ip",         offsetof(struct pt_regs, ip) },
 +
 +      { "bp",         offsetof(struct pt_regs, bp) },
 +      { "ss",         offsetof(struct pt_regs, ss) },
 +      { "cs",         offsetof(struct pt_regs, cs) },
 +      { "flags",      offsetof(struct pt_regs, flags) },
 +
 +      { "ds",         offsetof(struct pt_regs, ds) },
 +      { "es",         offsetof(struct pt_regs, es) },
 +      { "origax",     offsetof(struct pt_regs, orig_ax) },
 +
 +};
 +
 +static const int nkdbreglist = sizeof(kdbreglist) / sizeof(struct kdbregs);
 +
 +
 +/*
 + * kdba_getregcontents
 + *
 + *    Return the contents of the register specified by the
 + *    input string argument.   Return an error if the string
 + *    does not match a machine register.
 + *
 + *    The following pseudo register names are supported:
 + *       &regs         - Prints address of exception frame
 + *       kesp          - Prints kernel stack pointer at time of fault
 + *       cesp          - Prints current kernel stack pointer, inside kdb
 + *       ceflags       - Prints current flags, inside kdb
 + *       %<regname>    - Uses the value of the registers at the
 + *                       last time the user process entered kernel
 + *                       mode, instead of the registers at the time
 + *                       kdb was entered.
 + *
 + * Parameters:
 + *    regname         Pointer to string naming register
 + *    regs            Pointer to structure containing registers.
 + * Outputs:
 + *    *contents       Pointer to unsigned long to receive register contents
 + * Returns:
 + *    0               Success
 + *    KDB_BADREG      Invalid register name
 + * Locking:
 + *    None.
 + * Remarks:
 + *    If kdb was entered via an interrupt from the kernel itself then
 + *    ss and sp are *not* on the stack.
 + */
 +
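 +/* A minimal, illustrative use of the interface below (the snippet itself is
 + * not part of the original code): reading a saved register by name looks like
 + *
 + *      kdb_machreg_t val;
 + *      if (kdba_getregcontents("ip", regs, &val) == 0)
 + *              kdb_printf("ip = 0x%lx\n", val);
 + */
 +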
 +int
 +kdba_getregcontents(const char *regname,
 +                  struct pt_regs *regs,
 +                  kdb_machreg_t *contents)
 +{
 +      int i;
 +
 +      if (strcmp(regname, "cesp") == 0) {
 +              asm volatile("movl %%esp,%0":"=m" (*contents));
 +              return 0;
 +      }
 +
 +      if (strcmp(regname, "ceflags") == 0) {
 +              unsigned long flags;
 +              local_save_flags(flags);
 +              *contents = flags;
 +              return 0;
 +      }
 +
 +      if (regname[0] == '%') {
 +              /* User registers:  %%e[a-c]x, etc */
 +              regname++;
 +              regs = (struct pt_regs *)
 +                      (kdb_current_task->thread.sp0 - sizeof(struct pt_regs));
 +      }
 +
 +      for (i=0; i<ndbreglist; i++) {
 +              if (strnicmp(dbreglist[i].reg_name,
 +                           regname,
 +                           strlen(regname)) == 0)
 +                      break;
 +      }
 +
 +      if ((i < ndbreglist)
 +       && (strlen(dbreglist[i].reg_name) == strlen(regname))) {
 +              *contents = kdba_getdr(dbreglist[i].reg_offset);
 +              return 0;
 +      }
 +
 +      if (!regs) {
 +              kdb_printf("%s: pt_regs not available, use bt* or pid to select a different task\n", __FUNCTION__);
 +              return KDB_BADREG;
 +      }
 +
 +      if (strcmp(regname, "&regs") == 0) {
 +              *contents = (unsigned long)regs;
 +              return 0;
 +      }
 +
 +      if (strcmp(regname, "kesp") == 0) {
 +              *contents = (unsigned long)regs + sizeof(struct pt_regs);
 +              if ((regs->cs & 0xffff) == __KERNEL_CS) {
 +                      /* sp and ss are not on stack */
 +                      *contents -= 2*4;
 +              }
 +              return 0;
 +      }
 +
 +      for (i=0; i<nkdbreglist; i++) {
 +              if (strnicmp(kdbreglist[i].reg_name,
 +                           regname,
 +                           strlen(regname)) == 0)
 +                      break;
 +      }
 +
 +      if ((i < nkdbreglist)
 +       && (strlen(kdbreglist[i].reg_name) == strlen(regname))) {
 +              if ((regs->cs & 0xffff) == __KERNEL_CS) {
 +                      /* No cpl switch, sp and ss are not on stack */
 +                      if (strcmp(kdbreglist[i].reg_name, "sp") == 0) {
 +                              *contents = (kdb_machreg_t)regs +
 +                                      sizeof(struct pt_regs) - 2*4;
 +                              return(0);
 +                      }
 +                      if (strcmp(kdbreglist[i].reg_name, "xss") == 0) {
 +                              asm volatile(
 +                                      "pushl %%ss\n"
 +                                      "popl %0\n"
 +                                      :"=m" (*contents));
 +                              return(0);
 +                      }
 +              }
 +              *contents = *(unsigned long *)((unsigned long)regs +
 +                              kdbreglist[i].reg_offset);
 +              return(0);
 +      }
 +
 +      return KDB_BADREG;
 +}
 +
 +/*
 + * kdba_setregcontents
 + *
 + *    Set the contents of the register specified by the
 + *    input string argument.   Return an error if the string
 + *    does not match a machine register.
 + *
 + *    Supports modification of user-mode registers via
 + *    %<register-name>
 + *
 + * Parameters:
 + *    regname         Pointer to string naming register
 + *    regs            Pointer to structure containing registers.
 + *    contents        Unsigned long containing new register contents
 + * Outputs:
 + * Returns:
 + *    0               Success
 + *    KDB_BADREG      Invalid register name
 + * Locking:
 + *    None.
 + * Remarks:
 + */
 +
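 +/* Illustrative counterpart to kdba_getregcontents(), e.g.
 + *
 + *      kdba_setregcontents("ip", regs, new_ip);
 + *
 + * stores a (hypothetical) new_ip value into the saved instruction pointer.
 + */
 +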
 +int
 +kdba_setregcontents(const char *regname,
 +                struct pt_regs *regs,
 +                unsigned long contents)
 +{
 +      int i;
 +
 +      if (regname[0] == '%') {
 +              regname++;
 +              regs = (struct pt_regs *)
 +                      (kdb_current_task->thread.sp0 - sizeof(struct pt_regs));
 +      }
 +
 +      for (i=0; i<ndbreglist; i++) {
 +              if (strnicmp(dbreglist[i].reg_name,
 +                           regname,
 +                           strlen(regname)) == 0)
 +                      break;
 +      }
 +
 +      if ((i < ndbreglist)
 +       && (strlen(dbreglist[i].reg_name) == strlen(regname))) {
 +              kdba_putdr(dbreglist[i].reg_offset, contents);
 +              return 0;
 +      }
 +
 +      if (!regs) {
 +              kdb_printf("%s: pt_regs not available, use bt* or pid to select a different task\n", __FUNCTION__);
 +              return KDB_BADREG;
 +      }
 +
 +      for (i=0; i<nkdbreglist; i++) {
 +              if (strnicmp(kdbreglist[i].reg_name,
 +                           regname,
 +                           strlen(regname)) == 0)
 +                      break;
 +      }
 +
 +      if ((i < nkdbreglist)
 +       && (strlen(kdbreglist[i].reg_name) == strlen(regname))) {
 +              *(unsigned long *)((unsigned long)regs
 +                                 + kdbreglist[i].reg_offset) = contents;
 +              return 0;
 +      }
 +
 +      return KDB_BADREG;
 +}
 +
 +/*
 + * kdba_pt_regs
 + *
 + *    Format a struct pt_regs
 + *
 + * Inputs:
 + *    argc    argument count
 + *    argv    argument vector
 + * Outputs:
 + *    None.
 + * Returns:
 + *    zero for success, a kdb diagnostic if error
 + * Locking:
 + *    none.
 + * Remarks:
 + *    If no address is supplied, it uses the last irq pt_regs.
 + */
 +
 +static int
 +kdba_pt_regs(int argc, const char **argv)
 +{
 +      int diag;
 +      kdb_machreg_t addr;
 +      long offset = 0;
 +      int nextarg;
 +      struct pt_regs *p;
 +      static const char *fmt = "  %-11.11s 0x%lx\n";
 +
 +      if (argc == 0) {
 +              addr = (kdb_machreg_t) get_irq_regs();
 +      } else if (argc == 1) {
 +              nextarg = 1;
 +              diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
 +              if (diag)
 +                      return diag;
 +      } else {
 +              return KDB_ARGCOUNT;
 +      }
 +
 +      p = (struct pt_regs *) addr;
 +      kdb_printf("struct pt_regs 0x%p-0x%p\n", p, (unsigned char *)p + sizeof(*p) - 1);
 +      kdb_print_nameval("bx", p->bx);
 +      kdb_print_nameval("cx", p->cx);
 +      kdb_print_nameval("dx", p->dx);
 +      kdb_print_nameval("si", p->si);
 +      kdb_print_nameval("di", p->di);
 +      kdb_print_nameval("bp", p->bp);
 +      kdb_print_nameval("ax", p->ax);
 +      kdb_printf(fmt, "ds", p->ds);
 +      kdb_printf(fmt, "es", p->es);
 +      kdb_print_nameval("orig_ax", p->orig_ax);
 +      kdb_print_nameval("ip", p->ip);
 +      kdb_printf(fmt, "cs", p->cs);
 +      kdb_printf(fmt, "flags", p->flags);
 +      kdb_printf(fmt, "sp", p->sp);
 +      kdb_printf(fmt, "ss", p->ss);
 +      return 0;
 +}
 +
 +#else /* CONFIG_X86_32 */
 +
 +static struct kdbregs kdbreglist[] = {
 +      { "r15",        offsetof(struct pt_regs, r15) },
 +      { "r14",        offsetof(struct pt_regs, r14) },
 +      { "r13",        offsetof(struct pt_regs, r13) },
 +      { "r12",        offsetof(struct pt_regs, r12) },
 +      { "bp",         offsetof(struct pt_regs, bp) },
 +      { "bx",         offsetof(struct pt_regs, bx) },
 +      { "r11",        offsetof(struct pt_regs, r11) },
 +      { "r10",        offsetof(struct pt_regs, r10) },
 +      { "r9",         offsetof(struct pt_regs, r9) },
 +      { "r8",         offsetof(struct pt_regs, r8) },
 +      { "ax",         offsetof(struct pt_regs, ax) },
 +      { "cx",         offsetof(struct pt_regs, cx) },
 +      { "dx",         offsetof(struct pt_regs, dx) },
 +      { "si",         offsetof(struct pt_regs, si) },
 +      { "di",         offsetof(struct pt_regs, di) },
 +      { "orig_ax",    offsetof(struct pt_regs, orig_ax) },
 +      { "ip",         offsetof(struct pt_regs, ip) },
 +      { "cs",         offsetof(struct pt_regs, cs) },
 +      { "flags",      offsetof(struct pt_regs, flags) },
 +      { "sp",         offsetof(struct pt_regs, sp) },
 +      { "ss",         offsetof(struct pt_regs, ss) },
 +};
 +
 +static const int nkdbreglist = sizeof(kdbreglist) / sizeof(struct kdbregs);
 +
 +
 +/*
 + * kdba_getregcontents
 + *
 + *    Return the contents of the register specified by the
 + *    input string argument.   Return an error if the string
 + *    does not match a machine register.
 + *
 + *    The following pseudo register names are supported:
 + *       &regs         - Prints address of exception frame
 + *       krsp          - Prints kernel stack pointer at time of fault
 + *       crsp          - Prints current kernel stack pointer, inside kdb
 + *       ceflags       - Prints current flags, inside kdb
 + *       %<regname>    - Uses the value of the registers at the
 + *                       last time the user process entered kernel
 + *                       mode, instead of the registers at the time
 + *                       kdb was entered.
 + *
 + * Parameters:
 + *    regname         Pointer to string naming register
 + *    regs            Pointer to structure containing registers.
 + * Outputs:
 + *    *contents       Pointer to unsigned long to receive register contents
 + * Returns:
 + *    0               Success
 + *    KDB_BADREG      Invalid register name
 + * Locking:
 + *    None.
 + * Remarks:
 + *    If kdb was entered via an interrupt from the kernel itself then
 + *    ss and sp are *not* on the stack.
 + */
 +int
 +kdba_getregcontents(const char *regname,
 +                  struct pt_regs *regs,
 +                  kdb_machreg_t *contents)
 +{
 +      int i;
 +
 +      if (strcmp(regname, "&regs") == 0) {
 +              *contents = (unsigned long)regs;
 +              return 0;
 +      }
 +
 +      if (strcmp(regname, "krsp") == 0) {
 +              *contents = (unsigned long)regs + sizeof(struct pt_regs);
 +              if ((regs->cs & 0xffff) == __KERNEL_CS) {
 +                      /* sp and ss (two 8-byte words) are not on stack */
 +                      *contents -= 2*8;
 +              }
 +              return 0;
 +      }
 +
 +      if (strcmp(regname, "crsp") == 0) {
 +              asm volatile("movq %%rsp,%0":"=m" (*contents));
 +              return 0;
 +      }
 +
 +      if (strcmp(regname, "ceflags") == 0) {
 +              unsigned long flags;
 +              local_save_flags(flags);
 +              *contents = flags;
 +              return 0;
 +      }
 +
 +      if (regname[0] == '%') {
 +              /* User registers:  %%r[a-c]x, etc */
 +              regname++;
 +              regs = (struct pt_regs *)
 +                      (current->thread.sp0 - sizeof(struct pt_regs));
 +      }
 +
 +      for (i=0; i<nkdbreglist; i++) {
 +              if (strnicmp(kdbreglist[i].reg_name,
 +                           regname,
 +                           strlen(regname)) == 0)
 +                      break;
 +      }
 +
 +      if ((i < nkdbreglist)
 +       && (strlen(kdbreglist[i].reg_name) == strlen(regname))) {
 +              if ((regs->cs & 0xffff) == __KERNEL_CS) {
 +                      /* No cpl switch, sp and ss are not on stack */
 +                      if (strcmp(kdbreglist[i].reg_name, "sp") == 0) {
 +                              *contents = (kdb_machreg_t)regs +
 +                                      sizeof(struct pt_regs) - 2*8;
 +                              return(0);
 +                      }
 +#if 0 /* FIXME */
 +                      if (strcmp(kdbreglist[i].reg_name, "ss") == 0) {
 +                              kdb_machreg_t r;
 +
 +                              r = (kdb_machreg_t)regs +
 +                                      sizeof(struct pt_regs) - 2*8;
 +                              *contents = (kdb_machreg_t)SS(r);       /* XXX */
 +                              return(0);
 +                      }
 +#endif
 +              }
 +              *contents = *(unsigned long *)((unsigned long)regs +
 +                              kdbreglist[i].reg_offset);
 +              return(0);
 +      }
 +
 +      for (i=0; i<ndbreglist; i++) {
 +              if (strnicmp(dbreglist[i].reg_name,
 +                           regname,
 +                           strlen(regname)) == 0)
 +                      break;
 +      }
 +
 +      if ((i < ndbreglist)
 +       && (strlen(dbreglist[i].reg_name) == strlen(regname))) {
 +              *contents = kdba_getdr(dbreglist[i].reg_offset);
 +              return 0;
 +      }
 +      return KDB_BADREG;
 +}
 +
 +/*
 + * kdba_setregcontents
 + *
 + *    Set the contents of the register specified by the
 + *    input string argument.   Return an error if the string
 + *    does not match a machine register.
 + *
 + *    Supports modification of user-mode registers via
 + *    %<register-name>
 + *
 + * Parameters:
 + *    regname         Pointer to string naming register
 + *    regs            Pointer to structure containing registers.
 + *    contents        Unsigned long containing new register contents
 + * Outputs:
 + * Returns:
 + *    0               Success
 + *    KDB_BADREG      Invalid register name
 + * Locking:
 + *    None.
 + * Remarks:
 + */
 +
 +int
 +kdba_setregcontents(const char *regname,
 +                struct pt_regs *regs,
 +                unsigned long contents)
 +{
 +      int i;
 +
 +      if (regname[0] == '%') {
 +              regname++;
 +              regs = (struct pt_regs *)
 +                      (current->thread.sp0 - sizeof(struct pt_regs));
 +      }
 +
 +      for (i=0; i<nkdbreglist; i++) {
 +              if (strnicmp(kdbreglist[i].reg_name,
 +                           regname,
 +                           strlen(regname)) == 0)
 +                      break;
 +      }
 +
 +      if ((i < nkdbreglist)
 +       && (strlen(kdbreglist[i].reg_name) == strlen(regname))) {
 +              *(unsigned long *)((unsigned long)regs
 +                                 + kdbreglist[i].reg_offset) = contents;
 +              return 0;
 +      }
 +
 +      for (i=0; i<ndbreglist; i++) {
 +              if (strnicmp(dbreglist[i].reg_name,
 +                           regname,
 +                           strlen(regname)) == 0)
 +                      break;
 +      }
 +
 +      if ((i < ndbreglist)
 +       && (strlen(dbreglist[i].reg_name) == strlen(regname))) {
 +              kdba_putdr(dbreglist[i].reg_offset, contents);
 +              return 0;
 +      }
 +
 +      return KDB_BADREG;
 +}
 +
 +/*
 + * kdba_pt_regs
 + *
 + *    Format a struct pt_regs
 + *
 + * Inputs:
 + *    argc    argument count
 + *    argv    argument vector
 + * Outputs:
 + *    None.
 + * Returns:
 + *    zero for success, a kdb diagnostic if error
 + * Locking:
 + *    none.
 + * Remarks:
 + *    If no address is supplied, it uses the last irq pt_regs.
 + */
 +
 +static int
 +kdba_pt_regs(int argc, const char **argv)
 +{
 +      int diag;
 +      kdb_machreg_t addr;
 +      long offset = 0;
 +      int nextarg;
 +      struct pt_regs *p;
 +      static const char *fmt = "  %-11.11s 0x%lx\n";
 +      static int first_time = 1;
 +
 +      if (argc == 0) {
 +              addr = (kdb_machreg_t) get_irq_regs();
 +      } else if (argc == 1) {
 +              nextarg = 1;
 +              diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
 +              if (diag)
 +                      return diag;
 +      } else {
 +              return KDB_ARGCOUNT;
 +      }
 +
 +      p = (struct pt_regs *) addr;
 +      if (first_time) {
 +              first_time = 0;
 +              kdb_printf("\n+++ Warning: x86_64 pt_regs are not always "
 +                         "completely defined, r15-bx may be invalid\n\n");
 +      }
 +      kdb_printf("struct pt_regs 0x%p-0x%p\n", p, (unsigned char *)p + sizeof(*p) - 1);
 +      kdb_print_nameval("r15", p->r15);
 +      kdb_print_nameval("r14", p->r14);
 +      kdb_print_nameval("r13", p->r13);
 +      kdb_print_nameval("r12", p->r12);
 +      kdb_print_nameval("bp", p->bp);
 +      kdb_print_nameval("bx", p->bx);
 +      kdb_print_nameval("r11", p->r11);
 +      kdb_print_nameval("r10", p->r10);
 +      kdb_print_nameval("r9", p->r9);
 +      kdb_print_nameval("r8", p->r8);
 +      kdb_print_nameval("ax", p->ax);
 +      kdb_print_nameval("cx", p->cx);
 +      kdb_print_nameval("dx", p->dx);
 +      kdb_print_nameval("si", p->si);
 +      kdb_print_nameval("di", p->di);
 +      kdb_print_nameval("orig_ax", p->orig_ax);
 +      kdb_print_nameval("ip", p->ip);
 +      kdb_printf(fmt, "cs", p->cs);
 +      kdb_printf(fmt, "flags", p->flags);
 +      kdb_printf(fmt, "sp", p->sp);
 +      kdb_printf(fmt, "ss", p->ss);
 +      return 0;
 +}
 +#endif /* CONFIG_X86_32 */
 +
 +/*
 + * kdba_dumpregs
 + *
 + *    Dump the specified register set to the display.
 + *
 + * Parameters:
 + *    regs            Pointer to structure containing registers.
 + *    type            Character string identifying register set to dump
 + *    extra           string further identifying register (optional)
 + * Outputs:
 + * Returns:
 + *    0               Success
 + * Locking:
 + *    None.
 + * Remarks:
 + *    This function will dump the general register set if the type
 + *    argument is NULL (struct pt_regs).   The alternate register
 + *    set types supported by this function:
 + *
 + *    d               Debug registers
 + *    c               Control registers
 + *    u               User registers at most recent entry to kernel
 + *                    for the process currently selected with "pid" command.
 + * Following not yet implemented:
 + *    r               Memory Type Range Registers (extra defines register)
 + *
 + * MSR on i386/x86_64 are handled by rdmsr/wrmsr commands.
 + */
 +
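 +/* Illustrative calls matching the types documented above:
 + * kdba_dumpregs(regs, NULL, NULL) prints the general registers,
 + * kdba_dumpregs(regs, "d", NULL) the debug registers and
 + * kdba_dumpregs(regs, "c", NULL) the control registers.
 + */
 +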
 +int
 +kdba_dumpregs(struct pt_regs *regs,
 +          const char *type,
 +          const char *extra)
 +{
 +      int i;
 +      int count = 0;
 +
 +      if (type
 +       && (type[0] == 'u')) {
 +              type = NULL;
 +              regs = (struct pt_regs *)
 +                      (kdb_current_task->thread.sp0 - sizeof(struct pt_regs));
 +      }
 +
 +      if (type == NULL) {
 +              struct kdbregs *rlp;
 +              kdb_machreg_t contents;
 +
 +              if (!regs) {
 +                      kdb_printf("%s: pt_regs not available, use bt* or pid to select a different task\n", __FUNCTION__);
 +                      return KDB_BADREG;
 +              }
 +
 +#ifdef CONFIG_X86_32
 +              for (i=0, rlp=kdbreglist; i<nkdbreglist; i++,rlp++) {
 +                      kdb_printf("%s = ", rlp->reg_name);
 +                      kdba_getregcontents(rlp->reg_name, regs, &contents);
 +                      kdb_printf("0x%08lx ", contents);
 +                      if ((++count % 4) == 0)
 +                              kdb_printf("\n");
 +              }
 +#else
 +              for (i=0, rlp=kdbreglist; i<nkdbreglist; i++,rlp++) {
 +                      kdb_printf("%8s = ", rlp->reg_name);
 +                      kdba_getregcontents(rlp->reg_name, regs, &contents);
 +                      kdb_printf("0x%016lx ", contents);
 +                      if ((++count % 2) == 0)
 +                              kdb_printf("\n");
 +              }
 +#endif
 +
 +              kdb_printf("&regs = 0x%p\n", regs);
 +
 +              return 0;
 +      }
 +
 +      switch (type[0]) {
 +      case 'd':
 +      {
 +              unsigned long dr[8];
 +
 +              for(i=0; i<8; i++) {
 +                      if ((i == 4) || (i == 5)) continue;
 +                      dr[i] = kdba_getdr(i);
 +              }
 +              kdb_printf("dr0 = 0x%08lx  dr1 = 0x%08lx  dr2 = 0x%08lx  dr3 = 0x%08lx\n",
 +                         dr[0], dr[1], dr[2], dr[3]);
 +              kdb_printf("dr6 = 0x%08lx  dr7 = 0x%08lx\n",
 +                         dr[6], dr[7]);
 +              return 0;
 +      }
 +      case 'c':
 +      {
 +              unsigned long cr[5];
 +
 +              for (i=0; i<5; i++) {
 +                      cr[i] = kdba_getcr(i);
 +              }
 +              kdb_printf("cr0 = 0x%08lx  cr1 = 0x%08lx  cr2 = 0x%08lx  cr3 = 0x%08lx\ncr4 = 0x%08lx\n",
 +                         cr[0], cr[1], cr[2], cr[3], cr[4]);
 +              return 0;
 +      }
 +      case 'r':
 +              break;
 +      default:
 +              return KDB_BADREG;
 +      }
 +
 +      /* NOTREACHED */
 +      return 0;
 +}
 +EXPORT_SYMBOL(kdba_dumpregs);
 +
 +kdb_machreg_t
 +kdba_getpc(struct pt_regs *regs)
 +{
 +      return regs ? regs->ip : 0;
 +}
 +
 +int
 +kdba_setpc(struct pt_regs *regs, kdb_machreg_t newpc)
 +{
 +      if (KDB_NULL_REGS(regs))
 +              return KDB_BADREG;
 +      regs->ip = newpc;
 +      KDB_STATE_SET(IP_ADJUSTED);
 +      return 0;
 +}
 +
 +/*
 + * kdba_main_loop
 + *
 + *    Do any architecture specific set up before entering the main kdb loop.
 + *    The primary function of this routine is to make all processes look the
 + *    same to kdb; kdb must be able to list a process without worrying whether
 + *    the process is running or blocked, so make all processes look as though
 + *    they are blocked.
 + *
 + * Inputs:
 + *    reason          The reason KDB was invoked
 + *    error           The hardware-defined error code
 + *    reason2         kdb's current reason code.  Initially error but can change
 + *                    according to kdb state.
 + *    db_result       Result from break or debug point.
 + *    regs            The exception frame at time of fault/breakpoint.  If reason
 + *                    is SILENT or CPU_UP then regs is NULL, otherwise it should
 + *                    always be valid.
 + * Returns:
 + *    0       KDB was invoked for an event for which it was not responsible.
 + *    1       KDB handled the event for which it was invoked.
 + * Outputs:
 + *    Sets ip and sp in current->thread.
 + * Locking:
 + *    None.
 + * Remarks:
 + *    none.
 + */
 +
 +int
 +kdba_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
 +             kdb_dbtrap_t db_result, struct pt_regs *regs)
 +{
 +      int ret;
 +
 +#ifdef CONFIG_X86_64
 +      if (regs)
 +              kdba_getregcontents("sp", regs, &(current->thread.sp));
 +#endif
 +      ret = kdb_save_running(regs, reason, reason2, error, db_result);
 +      kdb_unsave_running(regs);
 +      return ret;
 +}
 +
 +void
 +kdba_disableint(kdb_intstate_t *state)
 +{
 +      unsigned long *fp = (unsigned long *)state;
 +      unsigned long flags;
 +
 +      local_irq_save(flags);
 +      *fp = flags;
 +}
 +
 +void
 +kdba_restoreint(kdb_intstate_t *state)
 +{
 +      unsigned long flags = *(unsigned long *)state;
 +      local_irq_restore(flags);
 +}
 +
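 +/* Arrange a hardware single step: remember whether interrupts were enabled
 + * (A_IF state), then set TF and clear IF so the next instruction traps back
 + * into the debugger without servicing interrupts in between.
 + */
 +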
 +void
 +kdba_setsinglestep(struct pt_regs *regs)
 +{
 +      if (KDB_NULL_REGS(regs))
 +              return;
 +      if (regs->flags & X86_EFLAGS_IF)
 +              KDB_STATE_SET(A_IF);
 +      else
 +              KDB_STATE_CLEAR(A_IF);
 +      regs->flags = (regs->flags | X86_EFLAGS_TF) & ~X86_EFLAGS_IF;
 +}
 +
 +void
 +kdba_clearsinglestep(struct pt_regs *regs)
 +{
 +      if (KDB_NULL_REGS(regs))
 +              return;
 +      if (KDB_STATE(A_IF))
 +              regs->flags |= X86_EFLAGS_IF;
 +      else
 +              regs->flags &= ~X86_EFLAGS_IF;
 +}
 +
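 +/* kdba_setjmp()/kdba_longjmp() are a minimal setjmp/longjmp over
 + * kdb_jmp_buf: kdba_setjmp saves the callee-saved registers, stack pointer
 + * and return address; kdba_longjmp restores them and resumes at the saved
 + * address, making kdba_setjmp appear to return again with 'reason' in the
 + * return register.
 + */
 +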
 +#ifdef CONFIG_X86_32
 +int asmlinkage
 +kdba_setjmp(kdb_jmp_buf *jb)
 +{
 +#ifdef CONFIG_FRAME_POINTER
 +      __asm__ ("movl 8(%esp), %eax\n\t"
 +               "movl %ebx, 0(%eax)\n\t"
 +               "movl %esi, 4(%eax)\n\t"
 +               "movl %edi, 8(%eax)\n\t"
 +               "movl (%esp), %ecx\n\t"
 +               "movl %ecx, 12(%eax)\n\t"
 +               "leal 8(%esp), %ecx\n\t"
 +               "movl %ecx, 16(%eax)\n\t"
 +               "movl 4(%esp), %ecx\n\t"
 +               "movl %ecx, 20(%eax)\n\t");
 +#else  /* CONFIG_FRAME_POINTER */
 +      __asm__ ("movl 4(%esp), %eax\n\t"
 +               "movl %ebx, 0(%eax)\n\t"
 +               "movl %esi, 4(%eax)\n\t"
 +               "movl %edi, 8(%eax)\n\t"
 +               "movl %ebp, 12(%eax)\n\t"
 +               "leal 4(%esp), %ecx\n\t"
 +               "movl %ecx, 16(%eax)\n\t"
 +               "movl 0(%esp), %ecx\n\t"
 +               "movl %ecx, 20(%eax)\n\t");
 +#endif   /* CONFIG_FRAME_POINTER */
 +      return 0;
 +}
 +
 +void asmlinkage
 +kdba_longjmp(kdb_jmp_buf *jb, int reason)
 +{
 +#ifdef CONFIG_FRAME_POINTER
 +      __asm__("movl 8(%esp), %ecx\n\t"
 +              "movl 12(%esp), %eax\n\t"
 +              "movl 20(%ecx), %edx\n\t"
 +              "movl 0(%ecx), %ebx\n\t"
 +              "movl 4(%ecx), %esi\n\t"
 +              "movl 8(%ecx), %edi\n\t"
 +              "movl 12(%ecx), %ebp\n\t"
 +              "movl 16(%ecx), %esp\n\t"
 +              "jmp *%edx\n");
 +#else    /* CONFIG_FRAME_POINTER */
 +      __asm__("movl 4(%esp), %ecx\n\t"
 +              "movl 8(%esp), %eax\n\t"
 +              "movl 20(%ecx), %edx\n\t"
 +              "movl 0(%ecx), %ebx\n\t"
 +              "movl 4(%ecx), %esi\n\t"
 +              "movl 8(%ecx), %edi\n\t"
 +              "movl 12(%ecx), %ebp\n\t"
 +              "movl 16(%ecx), %esp\n\t"
 +              "jmp *%edx\n");
 +#endif         /* CONFIG_FRAME_POINTER */
 +}
 +
 +#else /* CONFIG_X86_32 */
 +
 +int asmlinkage
 +kdba_setjmp(kdb_jmp_buf *jb)
 +{
 +#ifdef        CONFIG_FRAME_POINTER
 +      __asm__ __volatile__
 +              ("movq %%rbx, (0*8)(%%rdi);"
 +              "movq %%rcx, (1*8)(%%rdi);"
 +              "movq %%r12, (2*8)(%%rdi);"
 +              "movq %%r13, (3*8)(%%rdi);"
 +              "movq %%r14, (4*8)(%%rdi);"
 +              "movq %%r15, (5*8)(%%rdi);"
 +              "leaq 16(%%rsp), %%rdx;"
 +              "movq %%rdx, (6*8)(%%rdi);"
 +              "movq %%rax, (7*8)(%%rdi)"
 +              :
 +              : "a" (__builtin_return_address(0)),
 +                "c" (__builtin_frame_address(1))
 +              );
 +#else  /* !CONFIG_FRAME_POINTER */
 +      __asm__ __volatile__
 +              ("movq %%rbx, (0*8)(%%rdi);"
 +              "movq %%rbp, (1*8)(%%rdi);"
 +              "movq %%r12, (2*8)(%%rdi);"
 +              "movq %%r13, (3*8)(%%rdi);"
 +              "movq %%r14, (4*8)(%%rdi);"
 +              "movq %%r15, (5*8)(%%rdi);"
 +              "leaq 8(%%rsp), %%rdx;"
 +              "movq %%rdx, (6*8)(%%rdi);"
 +              "movq %%rax, (7*8)(%%rdi)"
 +              :
 +              : "a" (__builtin_return_address(0))
 +              );
 +#endif   /* CONFIG_FRAME_POINTER */
 +      return 0;
 +}
 +
 +void asmlinkage
 +kdba_longjmp(kdb_jmp_buf *jb, int reason)
 +{
 +      __asm__("movq (0*8)(%rdi),%rbx;"
 +              "movq (1*8)(%rdi),%rbp;"
 +              "movq (2*8)(%rdi),%r12;"
 +              "movq (3*8)(%rdi),%r13;"
 +              "movq (4*8)(%rdi),%r14;"
 +              "movq (5*8)(%rdi),%r15;"
 +              "movq (7*8)(%rdi),%rdx;"
 +              "movq (6*8)(%rdi),%rsp;"
 +              "mov %rsi, %rax;"
 +              "jmpq *%rdx");
 +}
 +#endif /* CONFIG_X86_32 */
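
The kdba_setjmp()/kdba_longjmp() pair above gives kdb a minimal non-local-exit facility: kdba_setjmp() captures the callee-saved registers, the stack pointer and the return address into a kdb_jmp_buf, and kdba_longjmp() restores them and jumps back, so the debugger can abandon an operation that faults while it is in control. As a rough user-space analogue only (standard C, not part of the patch; the kernel variants follow the same setjmp/longjmp contract), the control flow looks like this:

#include <setjmp.h>
#include <stdio.h>

static jmp_buf recover;                         /* analogue of kdb_jmp_buf */

static void faulting_operation(void)
{
        /* In kdb this would be a bad memory access; the fault handler
         * would then call kdba_longjmp() with a non-zero reason. */
        longjmp(recover, 1);
}

int main(void)
{
        if (setjmp(recover) == 0) {             /* kdba_setjmp() analogue */
                faulting_operation();
                puts("not reached");
        } else {
                puts("recovered, back in the command loop");
        }
        return 0;
}
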
 +
 +#ifdef CONFIG_X86_32
 +/*
 + * kdba_stackdepth
 + *
 + *    Print processes that are using more than a specific percentage of their
 + *    stack.
 + *
 + * Inputs:
 + *    argc    argument count
 + *    argv    argument vector
 + * Outputs:
 + *    None.
 + * Returns:
 + *    zero for success, a kdb diagnostic if error
 + * Locking:
 + *    none.
 + * Remarks:
 + *    If no percentage is supplied, it uses 60.
 + */
 +
 +static void
 +kdba_stackdepth1(struct task_struct *p, unsigned long sp)
 +{
 +      struct thread_info *tinfo;
 +      int used;
 +      const char *type;
 +      kdb_ps1(p);
 +      do {
 +              tinfo = (struct thread_info *)(sp & -THREAD_SIZE);
 +              used = sizeof(*tinfo) + THREAD_SIZE - (sp & (THREAD_SIZE-1));
 +              type = NULL;
 +              if (kdb_task_has_cpu(p)) {
 +                      struct kdb_activation_record ar;
 +                      memset(&ar, 0, sizeof(ar));
 +                      kdba_get_stack_info_alternate(sp, -1, &ar);
 +                      type = ar.stack.id;
 +              }
 +              if (!type)
 +                      type = "process";
 +              kdb_printf("  %s stack %p sp %lx used %d\n", type, tinfo, sp, used);
 +              sp = tinfo->previous_esp;
 +      } while (sp);
 +}
 +
 +static int
 +kdba_stackdepth(int argc, const char **argv)
 +{
 +      int diag, cpu, threshold, used, over;
 +      unsigned long percentage;
 +      unsigned long esp;
 +      long offset = 0;
 +      int nextarg;
 +      struct task_struct *p, *g;
 +      struct kdb_running_process *krp;
 +      struct thread_info *tinfo;
 +
 +      if (argc == 0) {
 +              percentage = 60;
 +      } else if (argc == 1) {
 +              nextarg = 1;
 +              diag = kdbgetaddrarg(argc, argv, &nextarg, &percentage, &offset, NULL);
 +              if (diag)
 +                      return diag;
 +      } else {
 +              return KDB_ARGCOUNT;
 +      }
 +      percentage = max_t(int, percentage, 1);
 +      percentage = min_t(int, percentage, 100);
 +      threshold = ((2 * THREAD_SIZE * percentage) / 100 + 1) >> 1;
 +      kdb_printf("stackdepth: processes using more than %ld%% (%d bytes) of stack\n",
 +              percentage, threshold);
 +
 +      /* Run the active tasks first, they can have multiple stacks */
 +      for (cpu = 0, krp = kdb_running_process; cpu < NR_CPUS; ++cpu, ++krp) {
 +              if (!cpu_online(cpu))
 +                      continue;
 +              p = krp->p;
 +              esp = krp->arch.sp;
 +              over = 0;
 +              do {
 +                      tinfo = (struct thread_info *)(esp & -THREAD_SIZE);
 +                      used = sizeof(*tinfo) + THREAD_SIZE - (esp & (THREAD_SIZE-1));
 +                      if (used >= threshold)
 +                              over = 1;
 +                      esp = tinfo->previous_esp;
 +              } while (esp);
 +              if (over)
 +                      kdba_stackdepth1(p, krp->arch.sp);
 +      }
 +      /* Now the tasks that are not on cpus */
 +      kdb_do_each_thread(g, p) {
 +              if (kdb_task_has_cpu(p))
 +                      continue;
 +              esp = p->thread.sp;
 +              used = sizeof(*tinfo) + THREAD_SIZE - (esp & (THREAD_SIZE-1));
 +              over = used >= threshold;
 +              if (over)
 +                      kdba_stackdepth1(p, esp);
 +      } kdb_while_each_thread(g, p);
 +
 +      return 0;
 +}
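
One small point in kdba_stackdepth() above: the threshold is computed as ((2 * THREAD_SIZE * percentage) / 100 + 1) >> 1, which rounds to the nearest byte rather than truncating. A stand-alone check of that arithmetic (plain C; the 8192-byte THREAD_SIZE and the 55% figure are just illustrative values, not taken from the patch):

#include <stdio.h>

int main(void)
{
        unsigned long thread_size = 8192;       /* assumed THREAD_SIZE */
        unsigned long percentage = 55;
        unsigned long threshold =
                ((2 * thread_size * percentage) / 100 + 1) >> 1;

        /* 55% of 8192 is 4505.6: plain truncation would give 4505,
         * the rounded formula prints 4506. */
        printf("threshold = %lu bytes\n", threshold);
        return 0;
}
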
 +#else /* CONFIG_X86_32 */
 +
 +
 +/*
 + * kdba_entry
 + *
 + *    This is the interface routine between
 + *    the notifier die_chain and kdb
 + */
 +static int kdba_entry( struct notifier_block *b, unsigned long val, void *v)
 +{
 +      struct die_args *args = v;
 +      int err, trap, ret = 0;
 +      struct pt_regs *regs;
 +
 +      regs = args->regs;
 +      err  = args->err;
 +      trap  = args->trapnr;
 +      switch (val){
 +#ifdef        CONFIG_SMP
 +              case DIE_NMI_IPI:
 +                      ret = kdb_ipi(regs, NULL);
 +                      break;
 +#endif        /* CONFIG_SMP */
 +              case DIE_OOPS:
 +                      ret = kdb(KDB_REASON_OOPS, err, regs);
 +                      break;
 +              case DIE_CALL:
 +                      ret = kdb(KDB_REASON_ENTER, err, regs);
 +                      break;
 +              case DIE_DEBUG:
 +                      ret = kdb(KDB_REASON_DEBUG, err, regs);
 +                      break;
 +              case DIE_NMIWATCHDOG:
 +                      ret = kdb(KDB_REASON_NMI, err, regs);
 +                      break;
 +              case DIE_INT3:
 +                      ret = kdb(KDB_REASON_BREAK, err, regs);
 +                      /* fall through to default */
 +              default:
 +                      break;
 +      }
 +      return (ret ? NOTIFY_STOP : NOTIFY_DONE);
 +}
 +
 +/*
 + * notifier block for kdb entry
 + */
 +static struct notifier_block kdba_notifier = {
 +      .notifier_call = kdba_entry
 +};
 +#endif /* CONFIG_X86_32 */
 +
 +asmlinkage int kdb_call(void);
 +
 +/* Executed once on each cpu at startup. */
 +void
 +kdba_cpu_up(void)
 +{
 +}
 +
 +static int __init
 +kdba_arch_init(void)
 +{
 +      set_intr_gate(KDBENTER_VECTOR, kdb_call);
 +      return 0;
 +}
 +
 +arch_initcall(kdba_arch_init);
 +
 +/*
 + * kdba_init
 + *
 + *    Architecture specific initialization.
 + *
 + * Parameters:
 + *    None.
 + * Returns:
 + *    None.
 + * Locking:
 + *    None.
 + * Remarks:
 + *    None.
 + */
 +
 +void __init
 +kdba_init(void)
 +{
 +      kdba_arch_init();       /* Need to register KDBENTER_VECTOR early */
 +      kdb_register("pt_regs", kdba_pt_regs, "address", "Format struct pt_regs", 0);
 +#ifdef CONFIG_X86_32
 +      kdb_register("stackdepth", kdba_stackdepth, "[percentage]", "Print processes using >= stack percentage", 0);
 +#else
 +      register_die_notifier(&kdba_notifier);
 +#endif
 +      return;
 +}
 +
 +/*
 + * kdba_adjust_ip
 + *
 + *    Architecture specific adjustment of instruction pointer before leaving
 + *    kdb.
 + *
 + * Parameters:
 + *    reason          The reason KDB was invoked
 + *    error           The hardware-defined error code
 + *    regs            The exception frame at time of fault/breakpoint.  If reason
 + *                    is SILENT or CPU_UP then regs is NULL, otherwise it should
 + *                    always be valid.
 + * Returns:
 + *    None.
 + * Locking:
 + *    None.
 + * Remarks:
 + *    noop on ix86.
 + */
 +
 +void
 +kdba_adjust_ip(kdb_reason_t reason, int error, struct pt_regs *regs)
 +{
 +      return;
 +}
 +
 +void
 +kdba_set_current_task(const struct task_struct *p)
 +{
 +      kdb_current_task = p;
 +      if (kdb_task_has_cpu(p)) {
 +              struct kdb_running_process *krp = kdb_running_process + kdb_process_cpu(p);
 +              kdb_current_regs = krp->regs;
 +              return;
 +      }
 +      kdb_current_regs = NULL;
 +}
 +
 +#ifdef CONFIG_X86_32
 +/*
 + * asm-i386 uaccess.h supplies __copy_to_user which relies on MMU to
 + * trap invalid addresses in the _xxx fields.  Verify the other address
 + * of the pair is valid by accessing the first and last byte ourselves,
 + * so that any access violations can only be caused by the _xxx
 + * addresses.
 + */
 +
 +int
 +kdba_putarea_size(unsigned long to_xxx, void *from, size_t size)
 +{
 +      mm_segment_t oldfs = get_fs();
 +      int r;
 +      char c;
 +      c = *((volatile char *)from);
 +      c = *((volatile char *)from + size - 1);
 +
 +      if (to_xxx < PAGE_OFFSET) {
 +              return kdb_putuserarea_size(to_xxx, from, size);
 +      }
 +
 +      set_fs(KERNEL_DS);
 +      r = __copy_to_user_inatomic((void __user *)to_xxx, from, size);
 +      set_fs(oldfs);
 +      return r;
 +}
 +
 +int
 +kdba_getarea_size(void *to, unsigned long from_xxx, size_t size)
 +{
 +      mm_segment_t oldfs = get_fs();
 +      int r;
 +      *((volatile char *)to) = '\0';
 +      *((volatile char *)to + size - 1) = '\0';
 +
 +      if (from_xxx < PAGE_OFFSET) {
 +              return kdb_getuserarea_size(to, from_xxx, size);
 +      }
 +
 +      set_fs(KERNEL_DS);
 +      switch (size) {
 +      case 1:
 +              r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, 1);
 +              break;
 +      case 2:
 +              r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, 2);
 +              break;
 +      case 4:
 +              r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, 4);
 +              break;
 +      case 8:
 +              r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, 8);
 +              break;
 +      default:
 +              r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, size);
 +              break;
 +      }
 +      set_fs(oldfs);
 +      return r;
 +}
 +
 +int
 +kdba_verify_rw(unsigned long addr, size_t size)
 +{
 +      unsigned char data[size];
 +      return(kdba_getarea_size(data, addr, size) || kdba_putarea_size(addr, data, size));
 +}
 +#endif /* CONFIG_X86_32 */
 +
 +#ifdef        CONFIG_SMP
 +
 +#include <asm/ipi.h>
 +
 +gate_desc save_idt[NR_VECTORS];
 +
 +void kdba_takeover_vector(int vector)
 +{
 +      memcpy(&save_idt[vector], &idt_table[vector], sizeof(gate_desc));
 +      set_intr_gate(KDB_VECTOR, kdb_interrupt);
 +      return;
 +}
 +
 +void kdba_giveback_vector(int vector)
 +{
 +      native_write_idt_entry(idt_table, vector, &save_idt[vector]);
 +      return;
 +}
 +
 +/* When first entering KDB, try a normal IPI.  That reduces backtrace problems
 + * on the other cpus.
 + */
 +void
 +smp_kdb_stop(void)
 +{
 +      if (!KDB_FLAG(NOIPI)) {
 +              kdba_takeover_vector(KDB_VECTOR);
 +              apic->send_IPI_allbutself(KDB_VECTOR);
 +      }
 +}
 +
 +/* The normal KDB IPI handler */
 +#ifdef CONFIG_X86_64
 +asmlinkage
 +#endif
 +void
 +smp_kdb_interrupt(struct pt_regs *regs)
 +{
 +      struct pt_regs *old_regs = set_irq_regs(regs);
 +      ack_APIC_irq();
 +      irq_enter();
 +      kdb_ipi(regs, NULL);
 +      irq_exit();
 +      set_irq_regs(old_regs);
 +}
 +
 +/* Invoked once from kdb_wait_for_cpus when waiting for cpus.  For those cpus
 + * that have not responded to the normal KDB interrupt yet, hit them with an
 + * NMI event.
 + */
 +void
 +kdba_wait_for_cpus(void)
 +{
 +      int c;
 +      if (KDB_FLAG(CATASTROPHIC))
 +              return;
 +      kdb_printf("  Sending NMI to non-responding cpus: ");
 +      for_each_online_cpu(c) {
 +              if (kdb_running_process[c].seqno < kdb_seqno - 1) {
 +                      kdb_printf(" %d", c);
 +                      apic->send_IPI_mask(cpumask_of(c), NMI_VECTOR);
 +              }
 +      }
 +      kdb_printf(".\n");
 +}
 +
 +#endif        /* CONFIG_SMP */
 +
 +#ifdef CONFIG_KDB_KDUMP
 +void kdba_kdump_prepare(struct pt_regs *regs)
 +{
 +      int i;
 +      struct pt_regs r;
 +      if (regs == NULL)
 +              regs = &r;
 +
 +      for (i = 1; i < NR_CPUS; ++i) {
 +              if (!cpu_online(i))
 +                      continue;
 +
 +              KDB_STATE_SET_CPU(KEXEC, i);
 +      }
 +
 +      machine_crash_shutdown(regs);
 +}
 +
 +extern void halt_current_cpu(struct pt_regs *);
 +
 +void kdba_kdump_shutdown_slave(struct pt_regs *regs)
 +{
- #ifndef CONFIG_PARAVIRT_XEN
++#ifndef CONFIG_XEN
 +      halt_current_cpu(regs);
 +#endif /* CONFIG_XEN */
 +}
 +
 +#endif /* CONFIG_KDB_KDUMP */
@@@ -1318,23 -1326,7 +1326,22 @@@ static int __init dmi_ignore_irq0_timer
        }
        return 0;
  }
- #endif
  
 +static int __init force_acpi_rsdt(const struct dmi_system_id *d)
 +{
 +      if (!acpi_force) {
 +              printk(KERN_NOTICE "%s detected: force use of acpi=rsdt\n",
 +                     d->ident);
 +              acpi_rsdt_forced = 1;
 +      } else {
 +              printk(KERN_NOTICE
 +                     "Warning: acpi=force overrules DMI blacklist: "
 +                     "acpi=rsdt\n");
 +      }
 +      return 0;
 +
 +}
 +
  /*
   * If your system is blacklisted here, but you find that acpi=force
   * works for you, please contact linux-acpi@vger.kernel.org
Simple merge
Simple merge
@@@ -2,9 -2,7 +2,8 @@@ obj-y                            =  mce.o mce-severity.
  
  obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
  obj-$(CONFIG_X86_MCE_INTEL)   += mce_intel.o
 +obj-$(CONFIG_X86_MCE_XEON75XX)        += mce-xeon75xx.o
  obj-$(CONFIG_X86_MCE_AMD)     += mce_amd.o
- obj-$(CONFIG_X86_XEN_MCE)     += mce_dom0.o
  obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
  obj-$(CONFIG_X86_MCE_INJECT)  += mce-inject.o
  
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -1243,41 -1232,7 +1243,41 @@@ ENTRY(call_softirq
        CFI_ENDPROC
  END(call_softirq)
  
 +#ifdef CONFIG_STACK_UNWIND
 +ENTRY(arch_unwind_init_running)
 +      CFI_STARTPROC
 +      movq    %r15, R15(%rdi)
 +      movq    %r14, R14(%rdi)
 +      xchgq   %rsi, %rdx
 +      movq    %r13, R13(%rdi)
 +      movq    %r12, R12(%rdi)
 +      xorl    %eax, %eax
 +      movq    %rbp, RBP(%rdi)
 +      movq    %rbx, RBX(%rdi)
 +      movq    (%rsp), %r9
 +      xchgq   %rdx, %rcx
 +      movq    %rax, R11(%rdi)
 +      movq    %rax, R10(%rdi)
 +      movq    %rax, R9(%rdi)
 +      movq    %rax, R8(%rdi)
 +      movq    %rax, RAX(%rdi)
 +      movq    %rax, RCX(%rdi)
 +      movq    %rax, RDX(%rdi)
 +      movq    %rax, RSI(%rdi)
 +      movq    %rax, RDI(%rdi)
 +      movq    %rax, ORIG_RAX(%rdi)
 +      movq    %r9, RIP(%rdi)
 +      leaq    8(%rsp), %r9
 +      movq    $__KERNEL_CS, CS(%rdi)
 +      movq    %rax, EFLAGS(%rdi)
 +      movq    %r9, RSP(%rdi)
 +      movq    $__KERNEL_DS, SS(%rdi)
 +      jmpq    *%rcx
 +      CFI_ENDPROC
 +END(arch_unwind_init_running)
 +#endif
 +
- #ifdef CONFIG_PARAVIRT_XEN
+ #ifdef CONFIG_XEN
  zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
  
  /*
@@@ -1374,35 -1329,8 +1374,35 @@@ ENTRY(xen_failsafe_callback
        CFI_ENDPROC
  END(xen_failsafe_callback)
  
- #endif /* CONFIG_PARAVIRT_XEN */
+ #endif /* CONFIG_XEN */
  
 +#ifdef        CONFIG_KDB
 +
 +#ifdef CONFIG_SMP
 +apicinterrupt KDB_VECTOR \
 +      kdb_interrupt, smp_kdb_interrupt
 +#endif        /* CONFIG_SMP */
 +
 +ENTRY(kdb_call)
 +      INTR_FRAME
 +      cld
 +      pushq $-1                       # orig_eax
 +      CFI_ADJUST_CFA_OFFSET 8
 +      SAVE_ALL
 +      movq $1,%rdi                    # KDB_REASON_ENTER
 +      movq $0,%rsi                    # error_code
 +      movq %rsp,%rdx                  # struct pt_regs
 +      call kdb
 +      RESTORE_ALL
 +      addq $8,%rsp                    # forget orig_eax
 +      CFI_ADJUST_CFA_OFFSET -8
 +      iretq
 +      CFI_ENDPROC
 +END(kdb_call)
 +
 +#endif        /* CONFIG_KDB */
 +
 +
  /*
   * Some functions should be protected against kprobes
   */
Simple merge
  #include <asm/cacheflush.h>
  #include <asm/debugreg.h>
  
- #ifdef CONFIG_XEN
- #include <xen/interface/kexec.h>
- #endif
 -static void set_idt(void *newidt, __u16 limit)
 -{
 -      struct desc_ptr curidt;
 -
 -      /* ia32 supports unaliged loads & stores */
 -      curidt.size    = limit;
 -      curidt.address = (unsigned long)newidt;
 -
 -      load_idt(&curidt);
 -}
 -
 -
 -static void set_gdt(void *newgdt, __u16 limit)
 -{
 -      struct desc_ptr curgdt;
 -
 -      /* ia32 supports unaligned loads & stores */
 -      curgdt.size    = limit;
 -      curgdt.address = (unsigned long)newgdt;
 -
 -      load_gdt(&curgdt);
 -}
 -
 -static void load_segments(void)
 -{
 -#define __STR(X) #X
 -#define STR(X) __STR(X)
 -
 -      __asm__ __volatile__ (
 -              "\tljmp $"STR(__KERNEL_CS)",$1f\n"
 -              "\t1:\n"
 -              "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
 -              "\tmovl %%eax,%%ds\n"
 -              "\tmovl %%eax,%%es\n"
 -              "\tmovl %%eax,%%fs\n"
 -              "\tmovl %%eax,%%gs\n"
 -              "\tmovl %%eax,%%ss\n"
 -              : : : "eax", "memory");
 -#undef STR
 -#undef __STR
 -}
 -
  static void machine_kexec_free_page_tables(struct kimage *image)
  {
        free_page((unsigned long)image->arch.pgd);
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -54,6 -55,6 +55,6 @@@ EXPORT_SYMBOL(__memcpy)
  
  EXPORT_SYMBOL(empty_zero_page);
  EXPORT_SYMBOL(init_level4_pgt);
- #if !defined(CONFIG_PARAVIRT_CPU) && !defined(CONFIG_XEN)
 -#ifndef CONFIG_PARAVIRT
++#ifndef CONFIG_PARAVIRT_CPU
  EXPORT_SYMBOL(native_load_gs_index);
  #endif
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -2,9 -2,9 +2,9 @@@
  # This Kconfig describes xen options
  #
  
- config PARAVIRT_XEN
+ config XEN
        bool "Xen guest support"
 -      select PARAVIRT
 +      select PARAVIRT_ALL
        select PARAVIRT_CLOCK
        depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS)
        depends on X86_CMPXCHG && X86_TSC
@@@ -33,7 -40,7 +33,6 @@@ obj-$(CONFIG_PARPORT)         += parport
  obj-y                         += base/ block/ misc/ mfd/
  obj-$(CONFIG_NUBUS)           += nubus/
  obj-y                         += macintosh/
- obj-$(CONFIG_XEN)             += xen/
 -obj-$(CONFIG_IDE)             += ide/
  obj-$(CONFIG_SCSI)            += scsi/
  obj-$(CONFIG_ATA)             += ata/
  obj-$(CONFIG_MTD)             += mtd/
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -35,9 -35,7 +35,9 @@@ obj-$(CONFIG_BLK_DEV_SX8)     += sx8.
  obj-$(CONFIG_BLK_DEV_UB)      += ub.o
  obj-$(CONFIG_BLK_DEV_HD)      += hd.o
  
- obj-$(CONFIG_XEN_BLKFRONT)    += xen-blkfront.o
+ obj-$(CONFIG_XEN_BLKDEV_FRONTEND)     += xen-blkfront.o
  obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
  
 +obj-$(CONFIG_CIPHER_TWOFISH)  += loop_fish2.o
 +
  swim_mod-objs := swim.o swim_asm.o
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -1280,15 -1407,18 +1407,19 @@@ static const struct of_device_id fsldma
        {}
  };
  
- static struct of_platform_driver of_fsl_dma_driver = {
-       .owner = THIS_MODULE,
-       .name = "fsl-elo-dma",
-       .match_table = of_fsl_dma_ids,
-       .probe = of_fsl_dma_probe,
-       .remove = of_fsl_dma_remove,
+ static struct of_platform_driver fsldma_of_driver = {
++      .owner          = THIS_MODULE,
+       .name           = "fsl-elo-dma",
+       .match_table    = fsldma_of_ids,
+       .probe          = fsldma_of_probe,
+       .remove         = fsldma_of_remove,
  };
  
- static __init int of_fsl_dma_init(void)
+ /*----------------------------------------------------------------------------*/
+ /* Module Init / Exit                                                         */
+ /*----------------------------------------------------------------------------*/
+ static __init int fsldma_init(void)
  {
        int ret;
  
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -610,57 -607,30 +617,57 @@@ static struct pgpath *parse_path(struc
        if (!p)
                return ERR_PTR(-ENOMEM);
  
 -      r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table),
 +      path = shift(as);
-       r = dm_get_device(ti, path, ti->begin, ti->len,
-                         dm_table_get_mode(ti->table), &p->path.dev);
++      r = dm_get_device(ti, path, dm_table_get_mode(ti->table),
+                         &p->path.dev);
        if (r) {
 -              ti->error = "error getting device";
 -              goto bad;
 +              unsigned major, minor;
 +
 +              /* Try to add a failed device */
 +              if (r == -ENXIO && sscanf(path, "%u:%u", &major, &minor) == 2) {
 +                      dev_t dev;
 +
 +                      /* Extract the major/minor numbers */
 +                      dev = MKDEV(major, minor);
 +                      if (MAJOR(dev) != major || MINOR(dev) != minor) {
 +                              /* Nice try, didn't work */
 +                              DMWARN("Invalid device path %s", path);
 +                              ti->error = "error converting devnum";
 +                              goto bad;
 +                      }
 +                      DMWARN("adding disabled device %d:%d", major, minor);
 +                      p->path.dev = NULL;
 +                      format_dev_t(p->path.pdev, dev);
 +                      p->is_active = 0;
 +              } else {
 +                      ti->error = "error getting device";
 +                      goto bad;
 +              }
 +      } else {
 +              memcpy(p->path.pdev, p->path.dev->name, 16);
        }
  
 -      if (m->hw_handler_name) {
 +      if (p->path.dev) {
                struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
  
 -              r = scsi_dh_attach(q, m->hw_handler_name);
 -              if (r == -EBUSY) {
 -                      /*
 -                       * Already attached to different hw_handler,
 -                       * try to reattach with correct one.
 -                       */
 -                      scsi_dh_detach(q);
 +              if (m->hw_handler_name) {
                        r = scsi_dh_attach(q, m->hw_handler_name);
 -              }
 -
 -              if (r < 0) {
 -                      ti->error = "error attaching hardware handler";
 -                      dm_put_device(ti, p->path.dev);
 -                      goto bad;
 +                      if (r == -EBUSY) {
 +                              /*
 +                               * Already attached to different hw_handler,
 +                               * try to reattach with correct one.
 +                               */
 +                              scsi_dh_detach(q);
 +                              r = scsi_dh_attach(q, m->hw_handler_name);
 +                      }
 +                      if (r < 0) {
 +                              ti->error = "error attaching hardware handler";
 +                              dm_put_device(ti, p->path.dev);
 +                              goto bad;
 +                      }
 +              } else {
 +                      /* Play safe and detach hardware handler */
 +                      scsi_dh_detach(q);
                }
  
                if (m->hw_handler_params) {
@@@ -1204,8 -1177,8 +1235,9 @@@ static void pg_init_done(void *data, in
                        errors = 0;
                        break;
                }
-               DMERR("Cannot failover device %s because scsi_dh_%s was not "
-                     "loaded.", pgpath->path.pdev, m->hw_handler_name);
 -              DMERR("Could not failover the device: Handler scsi_dh_%s "
 -                    "Error %d.", m->hw_handler_name, errors);
++              DMERR("Could not failover device %s: Handler scsi_dh_%s "
++                    "was not loaded.", pgpath->path.pdev,
++                    m->hw_handler_name);
                /*
                 * Fail path for now, so we do not ping pong
                 */
@@@ -1263,47 -1241,8 +1305,47 @@@ static void activate_path(struct work_s
        struct pgpath *pgpath =
                container_of(work, struct pgpath, activate_path);
  
 -      scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
 -                              pg_init_done, pgpath);
 +      if (pgpath->path.dev)
 +              scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
-                                pg_init_done, &pgpath->path);
++                               pg_init_done, pgpath);
 +}
 +
 +/*
 + * Evaluate scsi return code
 + */
 +static int eval_scsi_error(int result, char *sense, int sense_len)
 +{
 +      struct scsi_sense_hdr sshdr;
 +      int r = DM_ENDIO_REQUEUE;
 +
 +      if (host_byte(result) != DID_OK)
 +              return r;
 +
 +      if (msg_byte(result) != COMMAND_COMPLETE)
 +              return r;
 +
 +      if (status_byte(result) == RESERVATION_CONFLICT)
 +              /* Do not retry here, possible data corruption */
 +              return -EIO;
 +
 +#if defined(CONFIG_SCSI) || defined(CONFIG_SCSI_MODULE)
 +      if (status_byte(result) == CHECK_CONDITION &&
 +          !scsi_normalize_sense(sense, sense_len, &sshdr)) {
 +
 +              switch (sshdr.sense_key) {
 +              case MEDIUM_ERROR:
 +              case DATA_PROTECT:
 +              case BLANK_CHECK:
 +              case COPY_ABORTED:
 +              case VOLUME_OVERFLOW:
 +              case MISCOMPARE:
 +                      r = -EIO;
 +                      break;
 +              }
 +      }
 +#endif
 +
 +      return r;
  }
  
  /*
index a780346,0000000..eb5ae0a
mode 100644,000000..100644
--- /dev/null
@@@ -1,4523 -1,0 +1,4522 @@@
 +/*
 + * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
 + *
 + * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com>
 + *
 + * This file is released under the GPL.
 + *
 + *
 + * Linux 2.6 Device Mapper RAID4 and RAID5 target.
 + *
 + * Supports:
 + *    o RAID4 with dedicated and selectable parity device
 + *    o RAID5 with rotating parity (left+right, symmetric+asymmetric)
 + *    o run time optimization of xor algorithm used to calculate parity
 + *
 + *
 + * Thanks to MD for:
 + *    o the raid address calculation algorithm
 + *    o the base of the biovec <-> page list copier.
 + *
 + *
 + * Uses region hash to keep track of how many writes are in flight to
 + * regions in order to use dirty log to keep state of regions to recover:
 + *
 + *    o clean regions (those which are synchronized
 + *    and don't have write io in flight)
 + *    o dirty regions (those with write io in flight)
 + *
 + *
 + * On startup, any dirty regions are migrated to the 'nosync' state
 + * and are subject to recovery by the daemon.
 + *
 + * See raid_ctr() for table definition.
 + *
 + *
 + * FIXME:
 + * o add virtual interface for locking
 + * o remove instrumentation (REMOVEME:)
 + *
 + */
 +
 +static const char *version = "v0.2431";
 +
 +#include "dm.h"
 +#include "dm-memcache.h"
 +#include "dm-message.h"
 +#include "dm-raid45.h"
 +
 +#include <linux/kernel.h>
 +#include <linux/vmalloc.h>
 +
 +#include <linux/dm-io.h>
 +#include <linux/dm-dirty-log.h>
 +#include <linux/dm-region-hash.h>
 +
 +/* # of parallel recovered regions */
 +/* FIXME: cope with multiple recovery stripes in raid_set struct. */
 +#define MAX_RECOVER   1 /* needs to be 1! */
 +
 +/*
 + * Configurable parameters
 + */
 +#define       INLINE
 +
 +/* Default # of stripes if not set in constructor. */
 +#define       STRIPES                 64
 +
 +/* Minimum/maximum # of selectable stripes. */
 +#define       STRIPES_MIN             8
 +#define       STRIPES_MAX             16384
 +
 +/* Default chunk size in sectors if not set in constructor. */
 +#define       CHUNK_SIZE              64
 +
 +/* Default io size in sectors if not set in constructor. */
 +#define       IO_SIZE_MIN             SECTORS_PER_PAGE
 +#define       IO_SIZE                 IO_SIZE_MIN
 +
 +/* Maximum settable chunk size in sectors. */
 +#define       CHUNK_SIZE_MAX          16384
 +
 +/* Recover io size default in sectors. */
 +#define       RECOVER_IO_SIZE_MIN     64
 +#define       RECOVER_IO_SIZE         256
 +
 +/* Default percentage recover io bandwidth. */
 +#define       BANDWIDTH               10
 +#define       BANDWIDTH_MIN           1
 +#define       BANDWIDTH_MAX           100
 +/*
 + * END Configurable parameters
 + */
 +
 +#define       TARGET  "dm-raid45"
 +#define       DAEMON  "kraid45d"
 +#define       DM_MSG_PREFIX   TARGET
 +
 +#define       SECTORS_PER_PAGE        (PAGE_SIZE >> SECTOR_SHIFT)
 +
 +/* Amount/size for __xor(). */
 +#define       SECTORS_PER_XOR SECTORS_PER_PAGE
 +#define       XOR_SIZE        PAGE_SIZE
 +
 +/* Derive raid_set from stripe_cache pointer. */
 +#define       RS(x)   container_of(x, struct raid_set, sc)
 +
 +/* Check value in range. */
 +#define       range_ok(i, min, max)   ((i) >= (min) && (i) <= (max))
 +
 +/* Page reference. */
 +#define PAGE(stripe, p)       ((stripe)->obj[p].pl->page)
 +
 +/* Bio list reference. */
 +#define       BL(stripe, p, rw)       (stripe->ss[p].bl + rw)
 +
 +/* Page list reference. */
 +#define       PL(stripe, p)           (stripe->obj[p].pl)
 +
 +/* Check argument is power of 2. */
 +#define POWER_OF_2(a) (!((a) & ((a) - 1)))
 +
 +/* Factor out to dm-bio-list.h */
 +static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
 +{
 +      bio->bi_next = bl->head;
 +      bl->head = bio;
 +
 +      if (!bl->tail)
 +              bl->tail = bio;
 +}
 +
 +/* Factor out to dm.h */
 +#define TI_ERR_RET(str, ret) \
 +      do { ti->error = DM_MSG_PREFIX ": " str; return ret; } while (0)
 +#define TI_ERR(str)     TI_ERR_RET(str, -EINVAL)
 +
 +/*-----------------------------------------------------------------
 + * Stripe cache
 + *
 + * Cache for all reads and writes to raid sets (operational or degraded)
 + *
 + * We need to run all data to and from a RAID set through this cache,
 + * because parity chunks need to get calculated from data chunks
 + * or, in the degraded/resynchronization case, missing chunks need
 + * to be reconstructed using the other chunks of the stripe.
 + *---------------------------------------------------------------*/
 +/* Protect kmem cache # counter. */
 +static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */
 +
 +/* A stripe set (holds bios hanging off). */
 +struct stripe_set {
 +      struct stripe *stripe;  /* Backpointer to stripe for endio(). */
 +      struct bio_list bl[3]; /* Reads, writes, and writes merged. */
 +#define       WRITE_MERGED    2
 +};
 +
 +#if READ != 0 || WRITE != 1
 +#error dm-raid45: READ/WRITE != 0/1 used as index!!!
 +#endif
 +
 +/*
 + * Stripe linked list indexes. Keep order, because the stripe
 + * and the stripe cache rely on the first 3!
 + */
 +enum list_types {
 +      LIST_IO = 0,    /* Stripes with io pending. */
 +      LIST_ENDIO,     /* Stripes to endio. */
 +      LIST_LRU,       /* Least recently used stripes. */
 +      LIST_HASH,      /* Hashed stripes. */
 +      LIST_RECOVER = LIST_HASH,       /* For recovery type stripes only. */
 +      NR_LISTS,       /* To size array in struct stripe. */
 +};
 +
 +enum lock_types {
 +      LOCK_ENDIO = 0, /* Protect endio list. */
 +      LOCK_LRU,       /* Protect lru list. */
 +      NR_LOCKS,       /* To size array in struct stripe_cache. */
 +};
 +
 +/* A stripe: the io object to handle all reads and writes to a RAID set. */
 +struct stripe {
 +      struct stripe_cache *sc;        /* Backpointer to stripe cache. */
 +
 +      sector_t key;           /* Hash key. */
 +      region_t region;        /* Region stripe is mapped to. */
 +
 +      /* Reference count. */
 +      atomic_t cnt;
 +
 +      struct {
 +              unsigned long flags;    /* flags (see below). */
 +
 +              /*
 +               * Pending ios in flight:
 +               *
 +               * used as a 'lock' to control move of stripe to endio list
 +               */
 +              atomic_t pending;       /* Pending ios in flight. */
 +
 +              /* Sectors to read and write for multi page stripe sets. */
 +              unsigned size;
 +      } io;
 +
 +      /* Lock on stripe (for clustering). */
 +      void *lock;
 +
 +      /*
 +       * 4 linked lists:
 +       *   o io list to flush io
 +       *   o endio list
 +       *   o LRU list to put stripes w/o reference count on
 +       *   o stripe cache hash
 +       */
 +      struct list_head lists[NR_LISTS];
 +
 +      struct {
 +              unsigned short parity;  /* Parity chunk index. */
 +              short recover;          /* Recovery chunk index. */
 +      } idx;
 +
 +      /* This sets memory cache object (dm-mem-cache). */
 +      struct dm_mem_cache_object *obj;
 +
 +      /* Array of stripe sets (dynamically allocated). */
 +      struct stripe_set ss[0];
 +};
 +
 +/* States stripes can be in (flags field). */
 +enum stripe_states {
 +      STRIPE_ACTIVE,          /* Active io on stripe. */
 +      STRIPE_ERROR,           /* io error on stripe. */
 +      STRIPE_MERGED,          /* Writes got merged. */
 +      STRIPE_READ,            /* Read. */
 +      STRIPE_RBW,             /* Read-before-write. */
 +      STRIPE_RECONSTRUCT,     /* reconstruct of a missing chunk required. */
 +      STRIPE_RECOVER,         /* Stripe used for RAID set recovery. */
 +};
 +
 +/* ... and macros to access them. */
 +#define       BITOPS(name, what, var, flag) \
 +static inline int TestClear ## name ## what(struct var *v) \
 +{ return test_and_clear_bit(flag, &v->io.flags); } \
 +static inline int TestSet ## name ## what(struct var *v) \
 +{ return test_and_set_bit(flag, &v->io.flags); } \
 +static inline void Clear ## name ## what(struct var *v) \
 +{ clear_bit(flag, &v->io.flags); } \
 +static inline void Set ## name ## what(struct var *v) \
 +{ set_bit(flag, &v->io.flags); } \
 +static inline int name ## what(struct var *v) \
 +{ return test_bit(flag, &v->io.flags); }
 +
 +
 +BITOPS(Stripe, Active, stripe, STRIPE_ACTIVE)
 +BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
 +BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
 +BITOPS(Stripe, Read, stripe, STRIPE_READ)
 +BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
 +BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
 +BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)
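
The BITOPS() macro above stamps out five inline helpers per flag. Written out by hand, the first invocation, BITOPS(Stripe, Active, stripe, STRIPE_ACTIVE), is equivalent to the following (the remaining invocations follow the same pattern):

static inline int TestClearStripeActive(struct stripe *v)
{ return test_and_clear_bit(STRIPE_ACTIVE, &v->io.flags); }
static inline int TestSetStripeActive(struct stripe *v)
{ return test_and_set_bit(STRIPE_ACTIVE, &v->io.flags); }
static inline void ClearStripeActive(struct stripe *v)
{ clear_bit(STRIPE_ACTIVE, &v->io.flags); }
static inline void SetStripeActive(struct stripe *v)
{ set_bit(STRIPE_ACTIVE, &v->io.flags); }
static inline int StripeActive(struct stripe *v)
{ return test_bit(STRIPE_ACTIVE, &v->io.flags); }
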
 +
 +/* A stripe hash. */
 +struct stripe_hash {
 +      struct list_head *hash;
 +      unsigned buckets;
 +      unsigned mask;
 +      unsigned prime;
 +      unsigned shift;
 +};
 +
 +/* A stripe cache. */
 +struct stripe_cache {
 +      /* Stripe hash. */
 +      struct stripe_hash hash;
 +
 +      /* Stripes with io to flush, stripes to endio and LRU lists. */
 +      struct list_head lists[3];
 +
 +      /* Locks to protect endio and lru lists. */
 +      spinlock_t locks[NR_LOCKS];
 +
 +      /* Slab cache to allocate stripes from. */
 +      struct {
 +              struct kmem_cache *cache;       /* Cache itself. */
 +              char name[32];  /* Unique name. */
 +      } kc;
 +
 +      struct dm_io_client *dm_io_client; /* dm-io client resource context. */
 +
 +      /* dm-mem-cache client resource context. */
 +      struct dm_mem_cache_client *mem_cache_client;
 +
 +      int stripes_parm;           /* # stripes parameter from constructor. */
 +      atomic_t stripes;           /* actual # of stripes in cache. */
 +      atomic_t stripes_to_shrink; /* # of stripes to shrink cache by. */
 +      atomic_t stripes_last;      /* last # of stripes in cache. */
 +      atomic_t active_stripes;    /* actual # of active stripes in cache. */
 +
 +      /* REMOVEME: */
 +      atomic_t max_active_stripes; /* actual # of active stripes in cache. */
 +};
 +
 +/* Flag specs for raid_dev. */
 +enum raid_dev_flags { DEVICE_FAILED, IO_QUEUED };
 +
 +/* The raid device in a set. */
 +struct raid_dev {
 +      struct dm_dev *dev;
 +      unsigned long flags;    /* raid_dev_flags. */
 +      sector_t start;         /* offset to map to. */
 +};
 +
 +/* Flags spec for raid_set. */
 +enum raid_set_flags {
 +      RS_CHECK_OVERWRITE,     /* Check for chunk overwrites. */
 +      RS_DEAD,                /* RAID set inoperational. */
 +      RS_DEVEL_STATS,         /* REMOVEME: display status information. */
 +      RS_IO_ERROR,            /* io error on set. */
 +      RS_RECOVER,             /* Do recovery. */
 +      RS_RECOVERY_BANDWIDTH,  /* Allow recovery bandwidth (delayed bios). */
 +      RS_REGION_GET,          /* get a region to recover. */
 +      RS_SC_BUSY,             /* stripe cache busy -> send an event. */
 +      RS_SUSPENDED,           /* RAID set suspended. */
 +};
 +
 +/* REMOVEME: devel stats counters. */
 +enum stats_types {
 +      S_BIOS_READ,
 +      S_BIOS_ADDED_READ,
 +      S_BIOS_ENDIO_READ,
 +      S_BIOS_WRITE,
 +      S_BIOS_ADDED_WRITE,
 +      S_BIOS_ENDIO_WRITE,
 +      S_CAN_MERGE,
 +      S_CANT_MERGE,
 +      S_CONGESTED,
 +      S_DM_IO_READ,
 +      S_DM_IO_WRITE,
 +      S_ACTIVE_READS,
 +      S_BANDWIDTH,
 +      S_BARRIER,
 +      S_BIO_COPY_PL_NEXT,
 +      S_DEGRADED,
 +      S_DELAYED_BIOS,
 +      S_EVICT,
 +      S_FLUSHS,
 +      S_HITS_1ST,
 +      S_IOS_POST,
 +      S_INSCACHE,
 +      S_MAX_LOOKUP,
 +      S_MERGE_PAGE_LOCKED,
 +      S_NO_BANDWIDTH,
 +      S_NOT_CONGESTED,
 +      S_NO_RW,
 +      S_NOSYNC,
 +      S_PROHIBITPAGEIO,
 +      S_RECONSTRUCT_EI,
 +      S_RECONSTRUCT_DEV,
 +      S_REDO,
 +      S_REQUEUE,
 +      S_STRIPE_ERROR,
 +      S_SUM_DELAYED_BIOS,
 +      S_XORS,
 +      S_NR_STATS,     /* # of stats counters. */
 +};
 +
 +/* Status type -> string mappings. */
 +struct stats_map {
 +      const enum stats_types type;
 +      const char *str;
 +};
 +
 +static struct stats_map stats_map[] = {
 +      { S_BIOS_READ, "r=" },
 +      { S_BIOS_ADDED_READ, "/" },
 +      { S_BIOS_ENDIO_READ, "/" },
 +      { S_BIOS_WRITE, " w=" },
 +      { S_BIOS_ADDED_WRITE, "/" },
 +      { S_BIOS_ENDIO_WRITE, "/" },
 +      { S_DM_IO_READ, " rc=" },
 +      { S_DM_IO_WRITE, " wc=" },
 +      { S_ACTIVE_READS, " active_reads=" },
 +      { S_BANDWIDTH, " bandwidth=" },
 +      { S_NO_BANDWIDTH, " no_bandwidth=" },
 +      { S_BARRIER, " barrier=" },
 +      { S_BIO_COPY_PL_NEXT, " bio_copy_pl_next=" },
 +      { S_CAN_MERGE, " can_merge=" },
 +      { S_MERGE_PAGE_LOCKED, "/page_locked=" },
 +      { S_CANT_MERGE, "/cant_merge=" },
 +      { S_CONGESTED, " congested=" },
 +      { S_NOT_CONGESTED, "/not_congested=" },
 +      { S_DEGRADED, " degraded=" },
 +      { S_DELAYED_BIOS, " delayed_bios=" },
 +      { S_SUM_DELAYED_BIOS, "/sum_delayed_bios=" },
 +      { S_EVICT, " evict=" },
 +      { S_FLUSHS, " flushs=" },
 +      { S_HITS_1ST, " hits_1st=" },
 +      { S_IOS_POST, " ios_post=" },
 +      { S_INSCACHE, " inscache=" },
 +      { S_MAX_LOOKUP, " max_lookup=" },
 +      { S_NO_RW, " no_rw=" },
 +      { S_NOSYNC, " nosync=" },
 +      { S_PROHIBITPAGEIO, " ProhibitPageIO=" },
 +      { S_RECONSTRUCT_EI, " reconstruct_ei=" },
 +      { S_RECONSTRUCT_DEV, " reconstruct_dev=" },
 +      { S_REDO, " redo=" },
 +      { S_REQUEUE, " requeue=" },
 +      { S_STRIPE_ERROR, " stripe_error=" },
 +      { S_XORS, " xors=" },
 +};
 +
 +/*
 + * A RAID set.
 + */
 +typedef void (*xor_function_t)(unsigned count, unsigned long **data);
 +struct raid_set {
 +      struct dm_target *ti;   /* Target pointer. */
 +
 +      struct {
 +              unsigned long flags;    /* State flags. */
 +              spinlock_t in_lock;     /* Protects central input list below. */
 +              struct bio_list in;     /* Pending ios (central input list). */
 +              struct bio_list work;   /* ios work set. */
 +              wait_queue_head_t suspendq;     /* suspend synchronization. */
 +              atomic_t in_process;    /* counter of queued bios (suspendq). */
 +              atomic_t in_process_max;/* counter of queued bios max. */
 +
 +              /* io work. */
 +              struct workqueue_struct *wq;
 +              struct delayed_work dws;
 +      } io;
 +
 +      /* External locking. */
 +      struct dm_raid45_locking_type *locking;
 +
 +      struct stripe_cache sc; /* Stripe cache for this set. */
 +
 +      /* Xor optimization. */
 +      struct {
 +              struct xor_func *f;
 +              unsigned chunks;
 +              unsigned speed;
 +      } xor;
 +
 +      /* Recovery parameters. */
 +      struct recover {
 +              struct dm_dirty_log *dl;        /* Dirty log. */
 +              struct dm_region_hash *rh;      /* Region hash. */
 +
 +              /* dm-mem-cache client resource context for recovery stripes. */
 +              struct dm_mem_cache_client *mem_cache_client;
 +
 +              struct list_head stripes;       /* List of recovery stripes. */
 +
 +              region_t nr_regions;
 +              region_t nr_regions_to_recover;
 +              region_t nr_regions_recovered;
 +              unsigned long start_jiffies;
 +              unsigned long end_jiffies;
 +
 +              unsigned bandwidth;          /* Recovery bandwidth [%]. */
 +              unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
 +              unsigned bandwidth_parm; /*  " constructor parm. */
 +              unsigned io_size;        /* io size <= chunk size. */
 +              unsigned io_size_parm;   /* io size ctr parameter. */
 +
 +              /* recovery io throttling. */
 +              atomic_t io_count[2];   /* counter recover/regular io. */
 +              unsigned long last_jiffies;
 +
 +              struct dm_region *reg;  /* Actual region to recover. */
 +              sector_t pos;   /* Position within region to recover. */
 +              sector_t end;   /* End of region to recover. */
 +      } recover;
 +
 +      /* RAID set parameters. */
 +      struct {
 +              struct raid_type *raid_type;    /* RAID type (eg, RAID4). */
 +              unsigned raid_parms;    /* # variable raid parameters. */
 +
 +              unsigned chunk_size;    /* Sectors per chunk. */
 +              unsigned chunk_size_parm;
 +              unsigned chunk_mask;    /* Mask for amount. */
 +              unsigned chunk_shift;   /* rsector chunk size shift. */
 +
 +              unsigned io_size;       /* Sectors per io. */
 +              unsigned io_size_parm;
 +              unsigned io_mask;       /* Mask for amount. */
 +              unsigned io_shift_mask; /* Mask for raid_address(). */
 +              unsigned io_shift;      /* rsector io size shift. */
 +              unsigned pages_per_io;  /* Pages per io. */
 +
 +              sector_t sectors_per_dev;       /* Sectors per device. */
 +
 +              atomic_t failed_devs;           /* Amount of devices failed. */
 +
 +              /* Index of device to initialize. */
 +              int dev_to_init;
 +              int dev_to_init_parm;
 +
 +              /* Raid devices dynamically allocated. */
 +              unsigned raid_devs;     /* # of RAID devices below. */
 +              unsigned data_devs;     /* # of RAID data devices. */
 +
 +              int ei;         /* index of failed RAID device. */
 +
 +              /* index of dedicated parity device (i.e. RAID4). */
 +              int pi;
 +              int pi_parm;    /* constructor parm for status output. */
 +      } set;
 +
 +      /* REMOVEME: devel stats counters. */
 +      atomic_t stats[S_NR_STATS];
 +
 +      /* Dynamically allocated temporary pointers for xor(). */
 +      unsigned long **data;
 +
 +      /* Dynamically allocated RAID devices. Alignment? */
 +      struct raid_dev dev[0];
 +};
 +
 +
 +BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
 +BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
 +BITOPS(RS, Dead, raid_set, RS_DEAD)
 +BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
 +BITOPS(RS, IoError, raid_set, RS_IO_ERROR)
 +BITOPS(RS, Recover, raid_set, RS_RECOVER)
 +BITOPS(RS, RegionGet, raid_set, RS_REGION_GET)
 +BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
 +BITOPS(RS, Suspended, raid_set, RS_SUSPENDED)
 +#undef BITOPS
 +
 +#define       PageIO(page)            PageChecked(page)
 +#define       AllowPageIO(page)       SetPageChecked(page)
 +#define       ProhibitPageIO(page)    ClearPageChecked(page)
 +
 +/*-----------------------------------------------------------------
 + * Raid-4/5 set structures.
 + *---------------------------------------------------------------*/
 +/* RAID level definitions. */
 +enum raid_level {
 +      raid4,
 +      raid5,
 +};
 +
 +/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
 +enum raid_algorithm {
 +      none,
 +      left_asym,
 +      right_asym,
 +      left_sym,
 +      right_sym,
 +};
 +
 +struct raid_type {
 +      const char *name;               /* RAID algorithm. */
 +      const char *descr;              /* Descriptor text for logging. */
 +      const unsigned parity_devs;     /* # of parity devices. */
 +      const unsigned minimal_devs;    /* minimal # of devices in set. */
 +      const enum raid_level level;            /* RAID level. */
 +      const enum raid_algorithm algorithm;    /* RAID algorithm. */
 +};
 +
 +/* Supported raid types and properties. */
 +static struct raid_type raid_types[] = {
 +      {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
 +      {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym},
 +      {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym},
 +      {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym},
 +      {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym},
 +};
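
The raid5_* variants in the table above differ only in how the parity chunk rotates across the members and where each stripe's first data chunk starts. As an illustration of the conventional left-symmetric layout (the "raid5_ls" scheme as MD implements it; the target's own placement is computed in raid_address(), which lies outside this excerpt), a small stand-alone program that prints the chunk layout of a 4-disk set:

#include <stdio.h>

int main(void)
{
        const int disks = 4, stripes = 4;
        int s, i;

        for (s = 0; s < stripes; s++) {
                int pd = disks - 1 - (s % disks);       /* parity disk */

                printf("stripe %d:", s);
                for (i = 0; i < disks; i++) {
                        if (i == pd)
                                printf("  P  ");
                        else    /* data chunks start after parity and wrap */
                                printf("  D%-2d", s * (disks - 1) +
                                       (i - pd - 1 + disks) % disks);
                }
                putchar('\n');
        }
        /*
         * Parity moves one member to the left per stripe:
         *   stripe 0:  D0  D1  D2  P
         *   stripe 1:  D4  D5  P   D3
         *   stripe 2:  D8  P   D6  D7
         *   stripe 3:  P   D9  D10 D11
         */
        return 0;
}
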
 +
 +/* Address as calculated by raid_address(). */
 +struct address {
 +      sector_t key;           /* Hash key (start address of stripe). */
 +      unsigned di, pi;        /* Data and parity disks index. */
 +};
 +
 +/* REMOVEME: reset statistics counters. */
 +static void stats_reset(struct raid_set *rs)
 +{
 +      unsigned s = S_NR_STATS;
 +
 +      while (s--)
 +              atomic_set(rs->stats + s, 0);
 +}
 +
 +/*----------------------------------------------------------------
 + * RAID set management routines.
 + *--------------------------------------------------------------*/
 +/*
 + * Begin small helper functions.
 + */
 +/* Queue (optionally delayed) io work. */
 +static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
 +{
 +      struct delayed_work *dws = &rs->io.dws;
 +
 +      cancel_delayed_work(dws);
 +      queue_delayed_work(rs->io.wq, dws, delay);
 +}
 +
 +/* Queue io work immediately (called from region hash too). */
 +static INLINE void wake_do_raid(void *context)
 +{
 +      wake_do_raid_delayed(context, 0);
 +}
 +
 +/* Wait until all io has been processed. */
 +static INLINE void wait_ios(struct raid_set *rs)
 +{
 +      wait_event(rs->io.suspendq, !atomic_read(&rs->io.in_process));
 +}
 +
 +/* Declare io queued to device. */
 +static INLINE void io_dev_queued(struct raid_dev *dev)
 +{
 +      set_bit(IO_QUEUED, &dev->flags);
 +}
 +
 +/* Was io queued to the device?  Test and reset the flag. */
 +static inline int io_dev_clear(struct raid_dev *dev)
 +{
 +      return test_and_clear_bit(IO_QUEUED, &dev->flags);
 +}
 +
 +/* Get an io reference. */
 +static INLINE void io_get(struct raid_set *rs)
 +{
 +      int p = atomic_inc_return(&rs->io.in_process);
 +
 +      if (p > atomic_read(&rs->io.in_process_max))
 +              atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
 +}
 +
 +/* Put the io reference and conditionally wake io waiters. */
 +static INLINE void io_put(struct raid_set *rs)
 +{
 +      /* Intel: rebuild data corrupter? */
 +      if (!atomic_read(&rs->io.in_process)) {
 +              DMERR("%s would go negative!!!", __func__);
 +              return;
 +      }
 +
 +      if (atomic_dec_and_test(&rs->io.in_process))
 +              wake_up(&rs->io.suspendq);
 +}
 +
 +/* Calculate device sector offset. */
 +static INLINE sector_t _sector(struct raid_set *rs, struct bio *bio)
 +{
 +      sector_t sector = bio->bi_sector;
 +
 +      sector_div(sector, rs->set.data_devs);
 +      return sector;
 +}
 +
 +/* Test device operational. */
 +static INLINE int dev_operational(struct raid_set *rs, unsigned p)
 +{
 +      return !test_bit(DEVICE_FAILED, &rs->dev[p].flags);
 +}
 +
 +/* Return # of active stripes in stripe cache. */
 +static INLINE int sc_active(struct stripe_cache *sc)
 +{
 +      return atomic_read(&sc->active_stripes);
 +}
 +
 +/* Test io pending on stripe. */
 +static INLINE int stripe_io(struct stripe *stripe)
 +{
 +      return atomic_read(&stripe->io.pending);
 +}
 +
 +static INLINE void stripe_io_inc(struct stripe *stripe)
 +{
 +      atomic_inc(&stripe->io.pending);
 +}
 +
 +static INLINE void stripe_io_dec(struct stripe *stripe)
 +{
 +      atomic_dec(&stripe->io.pending);
 +}
 +
 +/* Wrapper needed by for_each_io_dev(). */
 +static void _stripe_io_inc(struct stripe *stripe, unsigned p)
 +{
 +      stripe_io_inc(stripe);
 +}
 +
 +/* Error a stripe. */
 +static INLINE void stripe_error(struct stripe *stripe, struct page *page)
 +{
 +      SetStripeError(stripe);
 +      SetPageError(page);
 +      atomic_inc(RS(stripe->sc)->stats + S_STRIPE_ERROR);
 +}
 +
 +/* Page IOed ok. */
 +enum dirty_type { CLEAN, DIRTY };
 +static INLINE void page_set(struct page *page, enum dirty_type type)
 +{
 +      switch (type) {
 +      case DIRTY:
 +              SetPageDirty(page);
 +              AllowPageIO(page);
 +              break;
 +
 +      case CLEAN:
 +              ClearPageDirty(page);
 +              break;
 +
 +      default:
 +              BUG();
 +      }
 +
 +      SetPageUptodate(page);
 +      ClearPageError(page);
 +}
 +
 +/* Return region state for a sector. */
 +static INLINE int
 +region_state(struct raid_set *rs, sector_t sector, unsigned long state)
 +{
 +      struct dm_region_hash *rh = rs->recover.rh;
 +
 +      return RSRecover(rs) ?
 +             (dm_rh_get_state(rh, dm_rh_sector_to_region(rh, sector), 1) &
 +              state) : 0;
 +}
 +
 +/* Check maximum devices which may fail in a raid set. */
 +static inline int raid_set_degraded(struct raid_set *rs)
 +{
 +      return RSIoError(rs);
 +}
 +
 +/* Check # of devices which may fail in a raid set. */
 +static INLINE int raid_set_operational(struct raid_set *rs)
 +{
 +      /* Too many failed devices -> BAD. */
 +      return atomic_read(&rs->set.failed_devs) <=
 +             rs->set.raid_type->parity_devs;
 +}
 +
 +/*
 + * Return true in case a page_list should be read/written
 + *
 + * Conditions to read/write:
 + *    o 1st page in list not uptodate
 + *    o 1st page in list dirty
 + *    o if we optimized io away, we flag it using the pages checked bit.
 + */
 +static INLINE unsigned page_io(struct page *page)
 +{
 +      /* Optimization: page was flagged to need io during first run. */
 +      if (PagePrivate(page)) {
 +              ClearPagePrivate(page);
 +              return 1;
 +      }
 +
 +      /* Avoid io if prohibited or a locked page. */
 +      if (!PageIO(page) || PageLocked(page))
 +              return 0;
 +
 +      if (!PageUptodate(page) || PageDirty(page)) {
 +              /* Flag page needs io for second run optimization. */
 +              SetPagePrivate(page);
 +              return 1;
 +      }
 +
 +      return 0;
 +}
 +
 +/* Call a function on each page list needing io. */
 +static INLINE unsigned
 +for_each_io_dev(struct raid_set *rs, struct stripe *stripe,
 +              void (*f_io)(struct stripe *stripe, unsigned p))
 +{
 +      unsigned p = rs->set.raid_devs, r = 0;
 +
 +      while (p--) {
 +              if (page_io(PAGE(stripe, p))) {
 +                      f_io(stripe, p);
 +                      r++;
 +              }
 +      }
 +
 +      return r;
 +}
 +
 +/* Reconstruct a particular device? */
 +static INLINE int dev_to_init(struct raid_set *rs)
 +{
 +      return rs->set.dev_to_init > -1;
 +}
 +
 +/*
 + * Index of device to calculate parity on.
 + * Either the parity device index *or* the selected device to init
 + * after a spare replacement.
 + */
 +static INLINE unsigned dev_for_parity(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +
 +      return dev_to_init(rs) ? rs->set.dev_to_init : stripe->idx.parity;
 +}
 +
 +/* Return the index of the device to be recovered. */
 +static int idx_get(struct raid_set *rs)
 +{
 +      /* Avoid reading in pages which will be reconstructed anyway. */
 +      if (dev_to_init(rs))
 +              return rs->set.dev_to_init;
 +      else if (rs->set.raid_type->level == raid4)
 +              return rs->set.pi;
 +
 +      return -1;
 +}
 +
 +/* RAID set congested function. */
 +static int raid_set_congested(void *congested_data, int bdi_bits)
 +{
 +      struct raid_set *rs = congested_data;
 +      int r = 0; /* Assume uncongested. */
 +      unsigned p = rs->set.raid_devs;
 +
 +      /* If any of our component devices are overloaded. */
 +      while (p--) {
 +              struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
 +
 +              r |= bdi_congested(&q->backing_dev_info, bdi_bits);
 +      }
 +
 +      /* REMOVEME: statistics. */
 +      atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
 +      return r;
 +}
 +
 +/* Display RAID set dead message once. */
 +static void raid_set_dead(struct raid_set *rs)
 +{
 +      if (!TestSetRSDead(rs)) {
 +              unsigned p;
 +              char buf[BDEVNAME_SIZE];
 +
 +              DMERR("FATAL: too many devices failed -> RAID set dead");
 +
 +              for (p = 0; p < rs->set.raid_devs; p++) {
 +                      if (!dev_operational(rs, p))
 +                              DMERR("device /dev/%s failed",
 +                                    bdevname(rs->dev[p].dev->bdev, buf));
 +              }
 +      }
 +}
 +
 +/* RAID set degrade check. */
 +static INLINE int
 +raid_set_check_and_degrade(struct raid_set *rs,
 +                         struct stripe *stripe, unsigned p)
 +{
 +      if (test_and_set_bit(DEVICE_FAILED, &rs->dev[p].flags))
 +              return -EPERM;
 +
 +      /* Throw an event in case of member device errors. */
 +      dm_table_event(rs->ti->table);
 +      atomic_inc(&rs->set.failed_devs);
 +
 +      /* Only log the first member error. */
 +      if (!TestSetRSIoError(rs)) {
 +              char buf[BDEVNAME_SIZE];
 +
 +              /* Store index for recovery. */
 +              mb();
 +              rs->set.ei = p;
 +              mb();
 +
 +              DMERR("CRITICAL: %sio error on device /dev/%s "
 +                    "in region=%llu; DEGRADING RAID set",
 +                    stripe ? "" : "FAKED ",
 +                    bdevname(rs->dev[p].dev->bdev, buf),
 +                    (unsigned long long) (stripe ? stripe->key : 0));
 +              DMERR("further device error messages suppressed");
 +      }
 +
 +      return 0;
 +}
 +
 +static void
 +raid_set_check_degrade(struct raid_set *rs, struct stripe *stripe)
 +{
 +      unsigned p = rs->set.raid_devs;
 +
 +      while (p--) {
 +              struct page *page = PAGE(stripe, p);
 +
 +              if (PageError(page)) {
 +                      ClearPageError(page);
 +                      raid_set_check_and_degrade(rs, stripe, p);
 +              }
 +      }
 +}
 +
 +/* RAID set upgrade check. */
 +static int raid_set_check_and_upgrade(struct raid_set *rs, unsigned p)
 +{
 +      if (!test_and_clear_bit(DEVICE_FAILED, &rs->dev[p].flags))
 +              return -EPERM;
 +
 +      if (atomic_dec_and_test(&rs->set.failed_devs)) {
 +              ClearRSIoError(rs);
 +              rs->set.ei = -1;
 +      }
 +
 +      return 0;
 +}
 +
 +/* Lookup a RAID device by name or by major:minor number. */
 +union dev_lookup {
 +      const char *dev_name;
 +      struct raid_dev *dev;
 +};
 +enum lookup_type { byname, bymajmin, bynumber };
 +static int raid_dev_lookup(struct raid_set *rs, enum lookup_type by,
 +                         union dev_lookup *dl)
 +{
 +      unsigned p;
 +
 +      /*
 +       * Must be an incremental loop, because the device array
 +       * can still have empty slots when called from raid_ctr().
 +       */
 +      for (p = 0; p < rs->set.raid_devs; p++) {
 +              char buf[BDEVNAME_SIZE];
 +              struct raid_dev *dev = rs->dev + p;
 +
 +              if (!dev->dev)
 +                      break;
 +
 +              /* Format dev string appropriately if necessary. */
 +              if (by == byname)
 +                      bdevname(dev->dev->bdev, buf);
 +              else if (by == bymajmin)
 +                      format_dev_t(buf, dev->dev->bdev->bd_dev);
 +
 +              /* Do the actual check. */
 +              if (by == bynumber) {
 +                      if (dl->dev->dev->bdev->bd_dev ==
 +                          dev->dev->bdev->bd_dev)
 +                              return p;
 +              } else if (!strcmp(dl->dev_name, buf))
 +                      return p;
 +      }
 +
 +      return -ENODEV;
 +}
 +
 +/* End io wrapper. */
 +static INLINE void
 +_bio_endio(struct raid_set *rs, struct bio *bio, int error)
 +{
 +      /* REMOVEME: statistics. */
 +      atomic_inc(rs->stats + (bio_data_dir(bio) == WRITE ?
 +                 S_BIOS_ENDIO_WRITE : S_BIOS_ENDIO_READ));
 +      bio_endio(bio, error);
 +      io_put(rs);             /* Wake any suspend waiters. */
 +}
 +
 +/*
 + * End small helper functions.
 + */
 +
 +
 +/*
 + * Stripe hash functions
 + */
 +/* Initialize/destroy stripe hash. */
 +static int hash_init(struct stripe_hash *hash, unsigned stripes)
 +{
 +      unsigned buckets = 2, max_buckets = stripes / 4;
 +      unsigned hash_primes[] = {
 +              /* Table of primes for hash_fn/table size optimization. */
 +              3, 7, 13, 27, 53, 97, 193, 389, 769,
 +              1543, 3079, 6151, 12289, 24593,
 +      };
 +
  +      /* Calculate number of buckets (2^n <= stripes / 4). */
 +      while (buckets < max_buckets)
 +              buckets <<= 1;
 +
 +      /* Allocate stripe hash. */
 +      hash->hash = vmalloc(buckets * sizeof(*hash->hash));
 +      if (!hash->hash)
 +              return -ENOMEM;
 +
 +      hash->buckets = buckets;
 +      hash->mask = buckets - 1;
 +      hash->shift = ffs(buckets);
 +      if (hash->shift > ARRAY_SIZE(hash_primes) + 1)
 +              hash->shift = ARRAY_SIZE(hash_primes) + 1;
 +
 +      BUG_ON(hash->shift - 2 > ARRAY_SIZE(hash_primes) + 1);
 +      hash->prime = hash_primes[hash->shift - 2];
 +
 +      /* Initialize buckets. */
 +      while (buckets--)
 +              INIT_LIST_HEAD(hash->hash + buckets);
 +
 +      return 0;
 +}
 +
 +static INLINE void hash_exit(struct stripe_hash *hash)
 +{
 +      if (hash->hash) {
 +              vfree(hash->hash);
 +              hash->hash = NULL;
 +      }
 +}
 +
 +/* List add (head/tail/locked/unlocked) inlines. */
 +enum list_lock_type { LIST_LOCKED, LIST_UNLOCKED };
 +#define       LIST_DEL(name, list) \
 +static void stripe_ ## name ## _del(struct stripe *stripe, \
 +                                  enum list_lock_type lock) { \
 +      struct list_head *lh = stripe->lists + (list); \
 +      spinlock_t *l = NULL; \
 +\
 +      if (lock == LIST_LOCKED) { \
 +              l = stripe->sc->locks + LOCK_LRU; \
 +              spin_lock_irq(l); \
 +      } \
 +\
 +\
 +      if (!list_empty(lh)) \
 +              list_del_init(lh); \
 +\
 +      if (lock == LIST_LOCKED) \
 +              spin_unlock_irq(l); \
 +}
 +
 +LIST_DEL(hash, LIST_HASH)
 +LIST_DEL(lru, LIST_LRU)
 +#undef LIST_DEL
 +
 +enum list_pos_type { POS_HEAD, POS_TAIL };
 +#define       LIST_ADD(name, list) \
 +static void stripe_ ## name ## _add(struct stripe *stripe, \
 +                                  enum list_pos_type pos, \
 +                                  enum list_lock_type lock) { \
 +      struct list_head *lh = stripe->lists + (list); \
 +      struct stripe_cache *sc = stripe->sc; \
 +      spinlock_t *l = NULL; \
 +\
 +      if (lock == LIST_LOCKED) { \
 +              l = sc->locks + LOCK_LRU; \
 +              spin_lock_irq(l); \
 +      } \
 +\
 +      if (list_empty(lh)) { \
 +              if (pos == POS_HEAD) \
 +                      list_add(lh, sc->lists + (list)); \
 +              else \
 +                      list_add_tail(lh, sc->lists + (list)); \
 +      } \
 +\
 +      if (lock == LIST_LOCKED) \
 +              spin_unlock_irq(l); \
 +}
 +
 +LIST_ADD(endio, LIST_ENDIO)
 +LIST_ADD(io, LIST_IO)
 +LIST_ADD(lru, LIST_LRU)
 +#undef LIST_ADD
 +
 +#define POP(list) \
 +      do { \
 +              if (list_empty(sc->lists + list)) \
 +                      stripe = NULL; \
 +              else { \
 +                      stripe = list_first_entry(&sc->lists[list], \
 +                                                struct stripe, \
 +                                                lists[list]); \
 +                      list_del_init(&stripe->lists[list]); \
 +              } \
 +      } while (0);
 +
 +/* Pop an available stripe off the lru list. */
 +static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
 +{
 +      struct stripe *stripe;
 +      spinlock_t *lock = sc->locks + LOCK_LRU;
 +
 +      spin_lock_irq(lock);
 +      POP(LIST_LRU);
 +      spin_unlock_irq(lock);
 +
 +      if (stripe)
 +              /* Remove from hash before reuse. */
 +              stripe_hash_del(stripe, LIST_UNLOCKED);
 +
 +      return stripe;
 +}
 +
 +static inline unsigned hash_fn(struct stripe_hash *hash, sector_t key)
 +{
 +      return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
 +}
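  +
  +/*
  + * Rough worked example (hypothetical cache of 256 stripes): hash_init()
  + * ends up with buckets = 64, hence mask = 63, shift = ffs(64) = 7 and
  + * prime = hash_primes[5] = 97.  A stripe key of 2048 then hashes to
  + * ((2048 * 97) >> 7) & 63 = 1552 & 63 = 16, i.e. bucket 16.
  + */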
 +
 +static inline struct list_head *
 +hash_bucket(struct stripe_hash *hash, sector_t key)
 +{
 +      return hash->hash + hash_fn(hash, key);
 +}
 +
 +/* Insert an entry into a hash. */
 +static inline void hash_insert(struct stripe_hash *hash, struct stripe *stripe)
 +{
 +      list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
 +}
 +
 +/* Insert an entry into the stripe hash. */
 +static inline void
 +sc_insert(struct stripe_cache *sc, struct stripe *stripe)
 +{
 +      hash_insert(&sc->hash, stripe);
 +}
 +
 +/* Lookup an entry in the stripe hash. */
 +static inline struct stripe *
 +stripe_lookup(struct stripe_cache *sc, sector_t key)
 +{
 +      unsigned c = 0;
 +      struct stripe *stripe;
 +      struct list_head *bucket = hash_bucket(&sc->hash, key);
 +
 +      list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
  +              /* REMOVEME: statistics. */
 +              if (++c > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
 +                      atomic_set(RS(sc)->stats + S_MAX_LOOKUP, c);
 +
 +              if (stripe->key == key)
 +                      return stripe;
 +      }
 +
 +      return NULL;
 +}
 +
 +/* Resize the stripe cache hash on size changes. */
 +static int hash_resize(struct stripe_cache *sc)
 +{
 +      /* Resize threshold reached? */
 +      if (atomic_read(&sc->stripes) > 2 * atomic_read(&sc->stripes_last)
 +          || atomic_read(&sc->stripes) < atomic_read(&sc->stripes_last) / 4) {
 +              int r;
 +              struct stripe_hash hash, hash_tmp;
 +              spinlock_t *lock;
 +
 +              r = hash_init(&hash, atomic_read(&sc->stripes));
 +              if (r)
 +                      return r;
 +
 +              lock = sc->locks + LOCK_LRU;
 +              spin_lock_irq(lock);
 +              if (sc->hash.hash) {
 +                      unsigned b = sc->hash.buckets;
 +                      struct list_head *pos, *tmp;
 +
 +                      /* Walk old buckets and insert into new. */
 +                      while (b--) {
 +                              list_for_each_safe(pos, tmp, sc->hash.hash + b)
 +                                  hash_insert(&hash,
 +                                              list_entry(pos, struct stripe,
 +                                                         lists[LIST_HASH]));
 +                      }
 +
 +              }
 +
 +              memcpy(&hash_tmp, &sc->hash, sizeof(hash_tmp));
 +              memcpy(&sc->hash, &hash, sizeof(sc->hash));
 +              atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
 +              spin_unlock_irq(lock);
 +
 +              hash_exit(&hash_tmp);
 +      }
 +
 +      return 0;
 +}
 +
 +/*
 + * Stripe cache locking functions
 + */
 +/* Dummy lock function for local RAID4+5. */
 +static void *no_lock(sector_t key, enum dm_lock_type type)
 +{
 +      return &no_lock;
 +}
 +
 +/* Dummy unlock function for local RAID4+5. */
 +static void no_unlock(void *lock_handle)
 +{
 +}
 +
 +/* No locking (for local RAID 4+5). */
 +static struct dm_raid45_locking_type locking_none = {
 +      .lock = no_lock,
 +      .unlock = no_unlock,
 +};
 +
 +/* Clustered RAID 4+5. */
 +/* FIXME: code this. */
 +static struct dm_raid45_locking_type locking_cluster = {
 +      .lock = no_lock,
 +      .unlock = no_unlock,
 +};
 +
 +/* Lock a stripe (for clustering). */
 +static int
 +stripe_lock(struct raid_set *rs, struct stripe *stripe, int rw, sector_t key)
 +{
 +      stripe->lock = rs->locking->lock(key, rw == READ ? DM_RAID45_SHARED :
 +                                                         DM_RAID45_EX);
 +      return stripe->lock ? 0 : -EPERM;
 +}
 +
 +/* Unlock a stripe (for clustering). */
 +static void stripe_unlock(struct raid_set *rs, struct stripe *stripe)
 +{
 +      rs->locking->unlock(stripe->lock);
 +      stripe->lock = NULL;
 +}
 +
 +/*
 + * Stripe cache functions.
 + */
 +/*
  + * Invalidate all pages of a stripe's page lists.
 + *
 + * I only keep state for the whole list in the first page.
 + */
 +static INLINE void
 +stripe_pages_invalidate(struct stripe *stripe)
 +{
 +      unsigned p = RS(stripe->sc)->set.raid_devs;
 +
 +      while (p--) {
 +              struct page *page = PAGE(stripe, p);
 +
 +              ProhibitPageIO(page);
 +              ClearPageChecked(page);
 +              ClearPageDirty(page);
 +              ClearPageError(page);
 +              __clear_page_locked(page);
 +              ClearPagePrivate(page);
 +              ClearPageUptodate(page);
 +      }
 +}
 +
 +/* Prepare stripe for (re)use. */
 +static INLINE void stripe_invalidate(struct stripe *stripe)
 +{
 +      stripe->io.flags = 0;
 +      stripe_pages_invalidate(stripe);
 +}
 +
 +/* Allow io on all chunks of a stripe. */
 +static INLINE void stripe_allow_io(struct stripe *stripe)
 +{
 +      unsigned p = RS(stripe->sc)->set.raid_devs;
 +
 +      while (p--)
 +              AllowPageIO(PAGE(stripe, p));
 +}
 +
 +/* Initialize a stripe. */
 +static void
 +stripe_init(struct stripe_cache *sc, struct stripe *stripe)
 +{
 +      unsigned p = RS(sc)->set.raid_devs;
 +      unsigned i;
 +
 +      /* Work all io chunks. */
 +      while (p--) {
 +              struct stripe_set *ss = stripe->ss + p;
 +
 +              stripe->obj[p].private = ss;
 +              ss->stripe = stripe;
 +
 +              i = ARRAY_SIZE(ss->bl);
 +              while (i--)
 +                      bio_list_init(ss->bl + i);
 +      }
 +
 +      stripe->sc = sc;
 +
 +      i = ARRAY_SIZE(stripe->lists);
 +      while (i--)
 +              INIT_LIST_HEAD(stripe->lists + i);
 +
 +      atomic_set(&stripe->cnt, 0);
 +      atomic_set(&stripe->io.pending, 0);
 +
 +      stripe_invalidate(stripe);
 +}
 +
 +/* Number of pages per chunk. */
 +static inline unsigned chunk_pages(unsigned io_size)
 +{
 +      return dm_div_up(io_size, SECTORS_PER_PAGE);
 +}
 +
 +/* Number of pages per stripe. */
 +static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
 +{
 +      return chunk_pages(io_size) * rs->set.raid_devs;
 +}
 +
 +/* Initialize part of page_list (recovery). */
 +static INLINE void stripe_zero_pl_part(struct stripe *stripe, unsigned p,
 +                                     unsigned start, unsigned count)
 +{
 +      unsigned pages = chunk_pages(count);
 +      /* Get offset into the page_list. */
 +      struct page_list *pl = pl_elem(PL(stripe, p), start / SECTORS_PER_PAGE);
 +
 +      BUG_ON(!pl);
 +      while (pl && pages--) {
 +              BUG_ON(!pl->page);
 +              memset(page_address(pl->page), 0, PAGE_SIZE);
 +              pl = pl->next;
 +      }
 +}
 +
 +/* Initialize parity chunk of stripe. */
 +static INLINE void stripe_zero_chunk(struct stripe *stripe, unsigned p)
 +{
 +      stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
 +}
 +
 +/* Return dynamic stripe structure size. */
 +static INLINE size_t stripe_size(struct raid_set *rs)
 +{
 +      return sizeof(struct stripe) +
 +                    rs->set.raid_devs * sizeof(struct stripe_set);
 +}
 +
 +/* Allocate a stripe and its memory object. */
 +/* XXX adjust to cope with stripe cache and recovery stripe caches. */
 +enum grow { SC_GROW, SC_KEEP };
 +static struct stripe *stripe_alloc(struct stripe_cache *sc,
 +                                 struct dm_mem_cache_client *mc,
 +                                 enum grow grow)
 +{
 +      int r;
 +      struct stripe *stripe;
 +
 +      stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
 +      if (stripe) {
 +              /* Grow the dm-mem-cache by one object. */
 +              if (grow == SC_GROW) {
 +                      r = dm_mem_cache_grow(mc, 1);
 +                      if (r)
 +                              goto err_free;
 +              }
 +
 +              stripe->obj = dm_mem_cache_alloc(mc);
 +              if (!stripe->obj)
 +                      goto err_shrink;
 +
 +              stripe_init(sc, stripe);
 +      }
 +
 +      return stripe;
 +
 +err_shrink:
 +      if (grow == SC_GROW)
 +              dm_mem_cache_shrink(mc, 1);
 +err_free:
 +      kmem_cache_free(sc->kc.cache, stripe);
 +      return NULL;
 +}
 +
 +/*
  + * Free a stripe's memory object, shrink the
  + * memory cache and free the stripe itself.
 + */
 +static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
 +{
 +      dm_mem_cache_free(mc, stripe->obj);
 +      dm_mem_cache_shrink(mc, 1);
 +      kmem_cache_free(stripe->sc->kc.cache, stripe);
 +}
 +
  +/* Free the recovery stripes. */
 +static void stripe_recover_free(struct raid_set *rs)
 +{
 +      struct recover *rec = &rs->recover;
 +      struct list_head *stripes = &rec->stripes;
 +
 +      while (!list_empty(stripes)) {
 +              struct stripe *stripe = list_first_entry(stripes, struct stripe,
 +                                                       lists[LIST_RECOVER]);
 +              list_del(stripe->lists + LIST_RECOVER);
 +              stripe_free(stripe, rec->mem_cache_client);
 +      }
 +}
 +
 +/* Push a stripe safely onto the endio list to be handled by do_endios(). */
 +static INLINE void stripe_endio_push(struct stripe *stripe)
 +{
 +      int wake;
 +      unsigned long flags;
 +      struct stripe_cache *sc = stripe->sc;
 +      spinlock_t *lock = sc->locks + LOCK_ENDIO;
 +
 +      spin_lock_irqsave(lock, flags);
 +      wake = list_empty(sc->lists + LIST_ENDIO);
 +      stripe_endio_add(stripe, POS_HEAD, LIST_UNLOCKED);
 +      spin_unlock_irqrestore(lock, flags);
 +
 +      if (wake)
 +              wake_do_raid(RS(sc));
 +}
 +
 +/* Protected check for stripe cache endio list empty. */
 +static INLINE int stripe_endio_empty(struct stripe_cache *sc)
 +{
 +      int r;
 +      spinlock_t *lock = sc->locks + LOCK_ENDIO;
 +
 +      spin_lock_irq(lock);
 +      r = list_empty(sc->lists + LIST_ENDIO);
 +      spin_unlock_irq(lock);
 +
 +      return r;
 +}
 +
  +/* Safely pop a stripe off the endio list. */
 +static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
 +{
 +      struct stripe *stripe;
 +      spinlock_t *lock = sc->locks + LOCK_ENDIO;
 +
 +      /* This runs in parallel with endio(). */
 +      spin_lock_irq(lock);
 +      POP(LIST_ENDIO)
 +      spin_unlock_irq(lock);
 +      return stripe;
 +}
 +
 +#undef POP
 +
 +/* Evict stripe from cache. */
 +static void stripe_evict(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      stripe_hash_del(stripe, LIST_UNLOCKED); /* Take off hash. */
 +
 +      if (list_empty(stripe->lists + LIST_LRU)) {
 +              stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
 +              atomic_inc(rs->stats + S_EVICT); /* REMOVEME: statistics. */
 +      }
 +}
 +
 +/* Grow stripe cache. */
 +static int
 +sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
 +{
 +      int r = 0;
 +      struct raid_set *rs = RS(sc);
 +
 +      /* Try to allocate this many (additional) stripes. */
 +      while (stripes--) {
 +              struct stripe *stripe =
 +                      stripe_alloc(sc, sc->mem_cache_client, grow);
 +
 +              if (likely(stripe)) {
 +                      stripe->io.size = rs->set.io_size;
 +                      stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
 +                      atomic_inc(&sc->stripes);
 +              } else {
 +                      r = -ENOMEM;
 +                      break;
 +              }
 +      }
 +
 +      ClearRSScBusy(rs);
 +      return r ? r : hash_resize(sc);
 +}
 +
 +/* Shrink stripe cache. */
 +static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
 +{
 +      int r = 0;
 +
 +      /* Try to get unused stripe from LRU list. */
 +      while (stripes--) {
 +              struct stripe *stripe;
 +
 +              stripe = stripe_lru_pop(sc);
 +              if (stripe) {
 +                      /* An lru stripe may never have ios pending! */
 +                      BUG_ON(stripe_io(stripe));
 +                      stripe_free(stripe, sc->mem_cache_client);
 +                      atomic_dec(&sc->stripes);
 +              } else {
 +                      r = -ENOENT;
 +                      break;
 +              }
 +      }
 +
 +      /* Check if stats are still sane. */
 +      if (atomic_read(&sc->max_active_stripes) >
 +          atomic_read(&sc->stripes))
 +              atomic_set(&sc->max_active_stripes, 0);
 +
 +      if (r)
 +              return r;
 +
 +      ClearRSScBusy(RS(sc));
 +      return hash_resize(sc);
 +}
 +
 +/* Create stripe cache. */
 +static int sc_init(struct raid_set *rs, unsigned stripes)
 +{
 +      unsigned i, nr;
 +      struct stripe_cache *sc = &rs->sc;
 +      struct stripe *stripe;
 +      struct recover *rec = &rs->recover;
 +
 +      /* Initialize lists and locks. */
 +      i = ARRAY_SIZE(sc->lists);
 +      while (i--)
 +              INIT_LIST_HEAD(sc->lists + i);
 +
 +      i = NR_LOCKS;
 +      while (i--)
 +              spin_lock_init(sc->locks + i);
 +
 +      /* Initialize atomic variables. */
 +      atomic_set(&sc->stripes, 0);
 +      atomic_set(&sc->stripes_last, 0);
 +      atomic_set(&sc->stripes_to_shrink, 0);
 +      atomic_set(&sc->active_stripes, 0);
 +      atomic_set(&sc->max_active_stripes, 0); /* REMOVEME: statistics. */
 +
 +      /*
 +       * We need a runtime unique # to suffix the kmem cache name
 +       * because we'll have one for each active RAID set.
 +       */
 +      nr = atomic_inc_return(&_stripe_sc_nr);
 +      sprintf(sc->kc.name, "%s_%d", TARGET, nr);
 +      sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
 +                                       0, 0, NULL);
 +      if (!sc->kc.cache)
 +              return -ENOMEM;
 +
 +      /* Create memory cache client context for RAID stripe cache. */
 +      sc->mem_cache_client =
 +              dm_mem_cache_client_create(stripes, rs->set.raid_devs,
 +                                         chunk_pages(rs->set.io_size));
 +      if (IS_ERR(sc->mem_cache_client))
 +              return PTR_ERR(sc->mem_cache_client);
 +
 +      /* Create memory cache client context for RAID recovery stripe(s). */
 +      rec->mem_cache_client =
 +              dm_mem_cache_client_create(MAX_RECOVER, rs->set.raid_devs,
 +                                         chunk_pages(rec->io_size));
 +      if (IS_ERR(rec->mem_cache_client))
 +              return PTR_ERR(rec->mem_cache_client);
 +
 +      /* Allocate stripe for set recovery. */
  +      /* XXX: cope with MAX_RECOVER > 1. */
 +      INIT_LIST_HEAD(&rec->stripes);
 +      for (i = 0; i < MAX_RECOVER; i++) {
 +              stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
 +              if (!stripe)
 +                      return -ENOMEM;
 +
 +              SetStripeRecover(stripe);
 +              stripe->io.size = rec->io_size;
 +              list_add(stripe->lists + LIST_RECOVER, &rec->stripes);
 +      }
 +
 +      /*
  +       * Allocate the stripe objects from the
 +       * cache and add them to the LRU list.
 +       */
 +      return sc_grow(sc, stripes, SC_KEEP);
 +}
 +
 +/* Destroy the stripe cache. */
 +static void sc_exit(struct stripe_cache *sc)
 +{
 +      if (sc->kc.cache) {
 +              BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
 +              kmem_cache_destroy(sc->kc.cache);
 +      }
 +
 +      if (sc->mem_cache_client)
 +              dm_mem_cache_client_destroy(sc->mem_cache_client);
 +
 +      ClearRSRecover(RS(sc));
 +      stripe_recover_free(RS(sc));
 +      if (RS(sc)->recover.mem_cache_client)
 +              dm_mem_cache_client_destroy(RS(sc)->recover.mem_cache_client);
 +
 +      hash_exit(&sc->hash);
 +}
 +
 +/*
 + * Calculate RAID address
 + *
  + * Delivers a tuple with the index of the data disk holding the chunk
  + * in the set, the parity disk's index and the start of the stripe
 + * within the address space of the set (used as the stripe cache hash key).
 + */
 +/* thx MD. */
 +static struct address *
 +raid_address(struct raid_set *rs, sector_t sector, struct address *addr)
 +{
 +      unsigned data_devs = rs->set.data_devs, di, pi,
 +               raid_devs = rs->set.raid_devs;
 +      sector_t stripe, tmp;
 +
 +      /*
 +       * chunk_number = sector / chunk_size
 +       * stripe = chunk_number / data_devs
 +       * di = stripe % data_devs;
 +       */
 +      stripe = sector >> rs->set.chunk_shift;
 +      di = sector_div(stripe, data_devs);
 +
 +      switch (rs->set.raid_type->level) {
 +      case raid5:
 +              tmp = stripe;
 +              pi = sector_div(tmp, raid_devs);
 +
 +              switch (rs->set.raid_type->algorithm) {
 +              case left_asym:         /* Left asymmetric. */
 +                      pi = data_devs - pi;
 +              case right_asym:        /* Right asymmetric. */
 +                      if (di >= pi)
 +                              di++;
 +                      break;
 +
 +              case left_sym:          /* Left symmetric. */
 +                      pi = data_devs - pi;
 +              case right_sym:         /* Right symmetric. */
 +                      di = (pi + di + 1) % raid_devs;
 +                      break;
 +
 +              default:
 +                      DMERR("Unknown RAID algorithm %d",
 +                            rs->set.raid_type->algorithm);
 +                      goto out;
 +              }
 +
 +              break;
 +
 +      case raid4:
 +              pi = rs->set.pi;
 +              if (di >= pi)
 +                      di++;
 +              break;
 +
 +      default:
 +              DMERR("Unknown RAID level %d", rs->set.raid_type->level);
 +              goto out;
 +      }
 +
 +      /*
 +       * Hash key = start offset on any single device of the RAID set;
 +       * adjusted in case io size differs from chunk size.
 +       */
 +      addr->key = (stripe << rs->set.chunk_shift) +
 +                  (sector & rs->set.io_shift_mask);
 +      addr->di = di;
 +      addr->pi = pi;
 +
 +out:
 +      return addr;
 +}
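  +
  +/*
  + * Rough worked example (hypothetical 4-disk RAID5, left symmetric,
  + * 64KiB chunks, i.e. chunk_shift = 7 with 512 byte sectors): sector 1000
  + * lies in chunk number 1000 >> 7 = 7, so stripe = 7 / 3 = 2 and
  + * di = 7 % 3 = 1.  The rotation gives pi = 2 % 4 = 2, flipped to
  + * 3 - 2 = 1 for the left variant, and the symmetric layout remaps
  + * di to (1 + 1 + 1) % 4 = 3; i.e. data on disk 3, parity on disk 1.
  + */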
 +
 +/*
 + * Copy data across between stripe pages and bio vectors.
 + *
 + * Pay attention to data alignment in stripe and bio pages.
 + */
 +static void
 +bio_copy_page_list(int rw, struct stripe *stripe,
 +                 struct page_list *pl, struct bio *bio)
 +{
 +      unsigned i, page_offset;
 +      void *page_addr;
 +      struct raid_set *rs = RS(stripe->sc);
 +      struct bio_vec *bv;
 +
 +      /* Get start page in page list for this sector. */
 +      i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
 +      pl = pl_elem(pl, i);
 +
 +      page_addr = page_address(pl->page);
 +      page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
 +
 +      /* Walk all segments and copy data across between bio_vecs and pages. */
 +      bio_for_each_segment(bv, bio, i) {
 +              int len = bv->bv_len, size;
 +              unsigned bio_offset = 0;
 +              void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
 +redo:
 +              size = (page_offset + len > PAGE_SIZE) ?
 +                     PAGE_SIZE - page_offset : len;
 +
 +              if (rw == READ)
 +                      memcpy(bio_addr + bio_offset,
 +                             page_addr + page_offset, size);
 +              else
 +                      memcpy(page_addr + page_offset,
 +                             bio_addr + bio_offset, size);
 +
 +              page_offset += size;
 +              if (page_offset == PAGE_SIZE) {
 +                      /*
 +                       * We reached the end of the chunk page ->
  +                       * need to refer to the next one to copy more data.
 +                       */
 +                      len -= size;
 +                      if (len) {
 +                              /* Get next page. */
 +                              pl = pl->next;
 +                              BUG_ON(!pl);
 +                              page_addr = page_address(pl->page);
 +                              page_offset = 0;
 +                              bio_offset += size;
 +                              /* REMOVEME: statistics. */
 +                              atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
 +                              goto redo;
 +                      }
 +              }
 +
 +              __bio_kunmap_atomic(bio_addr, KM_USER0);
 +      }
 +}
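  +
  +/*
  + * Example of the page crossing above (hypothetical 4KiB pages): a 2KiB
  + * bio_vec segment starting 3KiB into a chunk page copies 1KiB up to the
  + * page boundary, then follows pl->next and copies the remaining 1KiB at
  + * page_offset 0 of the chunk's next page.
  + */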
 +
 +/*
 + * Xor optimization macros.
 + */
 +/* Xor data pointer declaration and initialization macros. */
 +#define DECLARE_2     unsigned long *d0 = data[0], *d1 = data[1]
 +#define DECLARE_3     DECLARE_2, *d2 = data[2]
 +#define DECLARE_4     DECLARE_3, *d3 = data[3]
 +#define DECLARE_5     DECLARE_4, *d4 = data[4]
 +#define DECLARE_6     DECLARE_5, *d5 = data[5]
 +#define DECLARE_7     DECLARE_6, *d6 = data[6]
 +#define DECLARE_8     DECLARE_7, *d7 = data[7]
 +
  +/* Xor unroll macros. */
 +#define D2(n) d0[n] = d0[n] ^ d1[n]
 +#define D3(n) D2(n) ^ d2[n]
 +#define D4(n) D3(n) ^ d3[n]
 +#define D5(n) D4(n) ^ d4[n]
 +#define D6(n) D5(n) ^ d5[n]
 +#define D7(n) D6(n) ^ d6[n]
 +#define D8(n) D7(n) ^ d7[n]
 +
 +#define       X_2(macro, offset)      macro(offset); macro(offset + 1);
 +#define       X_4(macro, offset)      X_2(macro, offset); X_2(macro, offset + 2);
 +#define       X_8(macro, offset)      X_4(macro, offset); X_4(macro, offset + 4);
 +#define       X_16(macro, offset)     X_8(macro, offset); X_8(macro, offset + 8);
 +#define       X_32(macro, offset)     X_16(macro, offset); X_16(macro, offset + 16);
 +#define       X_64(macro, offset)     X_32(macro, offset); X_32(macro, offset + 32);
 +
 +/* Define a _xor_#chunks_#xors_per_run() function. */
 +#define       _XOR(chunks, xors_per_run) \
 +static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
 +{ \
 +      unsigned end = XOR_SIZE / sizeof(data[0]), i; \
 +      DECLARE_ ## chunks; \
 +\
 +      for (i = 0; i < end; i += xors_per_run) { \
 +              X_ ## xors_per_run(D ## chunks, i); \
 +      } \
 +}
 +
 +/* Define xor functions for 2 - 8 chunks. */
 +#define       MAKE_XOR_PER_RUN(xors_per_run) \
 +      _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
 +      _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
 +      _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
 +      _XOR(8, xors_per_run);
 +
 +MAKE_XOR_PER_RUN(8)   /* Define _xor_*_8() functions. */
 +MAKE_XOR_PER_RUN(16)  /* Define _xor_*_16() functions. */
 +MAKE_XOR_PER_RUN(32)  /* Define _xor_*_32() functions. */
 +MAKE_XOR_PER_RUN(64)  /* Define _xor_*_64() functions. */
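  +
  +/*
  + * For illustration, _XOR(3, 8) above roughly expands to:
  + *
  + *    static void _xor3_8(unsigned long **data)
  + *    {
  + *            unsigned end = XOR_SIZE / sizeof(data[0]), i;
  + *            unsigned long *d0 = data[0], *d1 = data[1], *d2 = data[2];
  + *
  + *            for (i = 0; i < end; i += 8) {
  + *                    d0[i] = d0[i] ^ d1[i] ^ d2[i];
  + *                    d0[i + 1] = d0[i + 1] ^ d1[i + 1] ^ d2[i + 1];
  + *                    ... unrolled up to offset i + 7 ...
  + *            }
  + *    }
  + *
  + * i.e. eight unrolled xors of chunks 1 and 2 into chunk 0 per pass.
  + */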
 +
 +#define MAKE_XOR(xors_per_run) \
 +struct { \
 +      void (*f)(unsigned long **); \
 +} static xor_funcs ## xors_per_run[] = { \
 +      { NULL }, \
 +      { NULL }, \
 +      { _xor2_ ## xors_per_run }, \
 +      { _xor3_ ## xors_per_run }, \
 +      { _xor4_ ## xors_per_run }, \
 +      { _xor5_ ## xors_per_run }, \
 +      { _xor6_ ## xors_per_run }, \
 +      { _xor7_ ## xors_per_run }, \
 +      { _xor8_ ## xors_per_run }, \
 +}; \
 +\
 +static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
 +{ \
 +      /* Call respective function for amount of chunks. */ \
 +      xor_funcs ## xors_per_run[n].f(data); \
 +}
 +
  +/* Define xor_8() - xor_64() functions. */
 +MAKE_XOR(8)
 +MAKE_XOR(16)
 +MAKE_XOR(32)
 +MAKE_XOR(64)
 +
  +/* Maximum number of chunks which can be xor'ed in one go. */
 +#define       XOR_CHUNKS_MAX  (ARRAY_SIZE(xor_funcs8) - 1)
 +
 +struct xor_func {
 +      xor_function_t f;
 +      const char *name;
 +} static xor_funcs[] = {
 +      {xor_8,   "xor_8"},
 +      {xor_16,  "xor_16"},
 +      {xor_32,  "xor_32"},
 +      {xor_64,  "xor_64"},
 +};
 +
 +/*
  + * Calculate parity.
  + *
  + * This indexes into the page list of the stripe.
  + *
  + * All chunks with io allowed will be xor'ed into the parity
  + * chunk, at most xor.chunks pages (including parity) per run.
 + *
 + * FIXME: try mapping the pages on discontiguous memory.
 + */
 +static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      unsigned max_chunks = rs->xor.chunks, n, p;
 +      unsigned o = sector / SECTORS_PER_PAGE; /* Offset into the page_list. */
 +      unsigned long **d = rs->data;
 +      xor_function_t xor_f = rs->xor.f->f;
 +
 +      /* Address of parity page to xor into. */
 +      d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
 +
 +      /* Preset pointers to data pages. */
 +      for (n = 1, p = rs->set.raid_devs; p--; ) {
 +              if (p != pi && PageIO(PAGE(stripe, p)))
 +                      d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
 +
  +              /* If max chunks -> xor. */
 +              if (n == max_chunks) {
 +                      xor_f(n, d);
 +                      n = 1;
 +              }
 +      }
 +
 +      /* If chunks -> xor. */
 +      if (n > 1)
 +              xor_f(n, d);
 +
 +      /* Set parity page uptodate and clean. */
 +      page_set(PAGE(stripe, pi), CLEAN);
 +}
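  +
  +/*
  + * Example run (hypothetical 8-disk set with xor.chunks = 4): d[0] keeps
  + * pointing at the parity page while the seven data pages with io allowed
  + * get fed in batches, i.e. xor_f(4, d) twice and xor_f(2, d) once for
  + * the last data page, accumulating all of them into the parity chunk.
  + */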
 +
 +/* Common xor loop through all stripe page lists. */
 +static void common_xor(struct stripe *stripe, sector_t count,
 +                     unsigned off, unsigned p)
 +{
 +      unsigned sector;
 +
 +      for (sector = off; sector < count; sector += SECTORS_PER_XOR)
 +              xor(stripe, p, sector);
 +
 +      atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
 +}
 +
 +/*
 + * Calculate parity sectors on intact stripes.
 + *
  + * Need to calculate the raid address for the recover stripe, because
  + * its chunk size differs and is typically larger than the io chunk size.
 + */
 +static void parity_xor(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      unsigned chunk_size = rs->set.chunk_size,
 +               io_size = stripe->io.size,
 +               xor_size = chunk_size > io_size ? io_size : chunk_size;
 +      sector_t off;
 +
 +      /* This can be the recover stripe with a larger io size. */
 +      for (off = 0; off < io_size; off += xor_size) {
 +              unsigned pi;
 +
 +              /*
 +               * Recover stripe likely is bigger than regular io
 +               * ones and has no precalculated parity disk index ->
 +               * need to calculate RAID address.
 +               */
 +              if (unlikely(StripeRecover(stripe))) {
 +                      struct address addr;
 +
 +                      raid_address(rs,
 +                                   (stripe->key + off) * rs->set.data_devs,
 +                                   &addr);
 +                      pi = addr.pi;
 +                      stripe_zero_pl_part(stripe, pi, off,
 +                                          rs->set.chunk_size);
 +              } else
 +                      pi = stripe->idx.parity;
 +
 +              common_xor(stripe, xor_size, off, pi);
 +              page_set(PAGE(stripe, pi), DIRTY);
 +      }
 +}
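  +
  +/*
  + * Example (hypothetical RAID5 recovery stripe spanning four chunks):
  + * the loop above runs four times, one chunk_size worth of sectors per
  + * pass, and recomputes raid_address() each time because the rotating
  + * parity index differs from chunk to chunk within the recovery stripe.
  + */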
 +
 +/* Reconstruct missing chunk. */
 +static void reconstruct_xor(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      int p = stripe->idx.recover;
 +
 +      BUG_ON(p < 0);
 +
 +      /* REMOVEME: statistics. */
 +      atomic_inc(rs->stats + (raid_set_degraded(rs) ?
 +                  S_RECONSTRUCT_EI : S_RECONSTRUCT_DEV));
 +
 +      /* Zero chunk to be reconstructed. */
 +      stripe_zero_chunk(stripe, p);
 +      common_xor(stripe, stripe->io.size, 0, p);
 +}
 +
 +/*
 + * Try getting a stripe either from the hash or from the lru list
 + */
 +static inline void _stripe_get(struct stripe *stripe)
 +{
 +      atomic_inc(&stripe->cnt);
 +}
 +
 +static struct stripe *stripe_get(struct raid_set *rs, struct address *addr)
 +{
 +      struct stripe_cache *sc = &rs->sc;
 +      struct stripe *stripe;
 +
 +      stripe = stripe_lookup(sc, addr->key);
 +      if (stripe) {
 +              _stripe_get(stripe);
 +              /* Remove from the lru list if on. */
 +              stripe_lru_del(stripe, LIST_LOCKED);
 +              atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
 +      } else {
 +              /* Second try to get an LRU stripe. */
 +              stripe = stripe_lru_pop(sc);
 +              if (stripe) {
 +                      _stripe_get(stripe);
 +                      /* Invalidate before reinserting with changed key. */
 +                      stripe_invalidate(stripe);
 +                      stripe->key = addr->key;
 +                      stripe->region = dm_rh_sector_to_region(rs->recover.rh,
 +                                                              addr->key);
 +                      stripe->idx.parity = addr->pi;
 +                      sc_insert(sc, stripe);
 +                      /* REMOVEME: statistics. */
 +                      atomic_inc(rs->stats + S_INSCACHE);
 +              }
 +      }
 +
 +      return stripe;
 +}
 +
 +/*
 + * Decrement reference count on a stripe.
 + *
 + * Move it to list of LRU stripes if zero.
 + */
 +static void stripe_put(struct stripe *stripe)
 +{
 +      if (atomic_dec_and_test(&stripe->cnt)) {
 +              if (TestClearStripeActive(stripe))
 +                      atomic_dec(&stripe->sc->active_stripes);
 +
 +              /* Put stripe onto the LRU list. */
 +              stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
 +      }
 +
 +      BUG_ON(atomic_read(&stripe->cnt) < 0);
 +}
 +
 +/*
 + * Process end io
 + *
  + * I need to do it here because I can't do it in interrupt context.
  + *
  + * Read and write functions are split in order to avoid
  + * conditionals in the main loop for performance reasons.
 + */
 +
 +/* Helper read bios on a page list. */
 +static void _bio_copy_page_list(struct stripe *stripe, struct page_list *pl,
 +                              struct bio *bio)
 +{
 +      bio_copy_page_list(READ, stripe, pl, bio);
 +}
 +
 +/* Helper write bios on a page list. */
 +static void _rh_dec(struct stripe *stripe, struct page_list *pl,
 +                  struct bio *bio)
 +{
 +      dm_rh_dec(RS(stripe->sc)->recover.rh, stripe->region);
 +}
 +
 +/* End io all bios on a page list. */
 +static inline int
 +page_list_endio(int rw, struct stripe *stripe, unsigned p, unsigned *count)
 +{
 +      int r = 0;
 +      struct bio_list *bl = BL(stripe, p, rw);
 +
 +      if (!bio_list_empty(bl)) {
 +              struct page_list *pl = PL(stripe, p);
 +              struct page *page = pl->page;
 +
 +              if (PageLocked(page))
 +                      r = -EBUSY;
 +              /*
 +               * FIXME: PageUptodate() not cleared
 +               *        properly for missing chunks ?
 +               */
 +              else if (PageUptodate(page)) {
 +                      struct bio *bio;
 +                      struct raid_set *rs = RS(stripe->sc);
 +                      void (*h_f)(struct stripe *, struct page_list *,
 +                                  struct bio *) =
 +                              (rw == READ) ? _bio_copy_page_list : _rh_dec;
 +
 +                      while ((bio = bio_list_pop(bl))) {
 +                              h_f(stripe, pl, bio);
 +                              _bio_endio(rs, bio, 0);
 +                              stripe_put(stripe);
 +                              if (count)
 +                                      (*count)++;
 +                      }
 +              } else
 +                      r = -EAGAIN;
 +      }
 +
 +      return r;
 +}
 +
 +/*
  + * End io all reads/writes on a stripe, copying
  + * read data across from stripe to bios.
 + */
 +static int stripe_endio(int rw, struct stripe *stripe, unsigned *count)
 +{
 +      int r = 0;
 +      unsigned p = RS(stripe->sc)->set.raid_devs;
 +
 +      while (p--) {
 +              int rr = page_list_endio(rw, stripe, p, count);
 +
 +              if (rr && r != -EIO)
 +                      r = rr;
 +      }
 +
 +      return r;
 +}
 +
 +/* Fail all ios on a bio list and return # of bios. */
 +static unsigned
 +bio_list_fail(struct raid_set *rs, struct stripe *stripe, struct bio_list *bl)
 +{
 +      unsigned r;
 +      struct bio *bio;
 +
 +      raid_set_dead(rs);
 +
 +      /* Update region counters. */
 +      if (stripe) {
 +              struct dm_region_hash *rh = rs->recover.rh;
 +
 +              bio_list_for_each(bio, bl) {
 +                      if (bio_data_dir(bio) == WRITE)
 +                              dm_rh_dec(rh, stripe->region);
 +              }
 +      }
 +
 +      /* Error end io all bios. */
 +      for (r = 0; (bio = bio_list_pop(bl)); r++)
 +              _bio_endio(rs, bio, -EIO);
 +
 +      return r;
 +}
 +
 +/* Fail all ios of a bio list of a stripe and drop io pending count. */
 +static void
 +stripe_bio_list_fail(struct raid_set *rs, struct stripe *stripe,
 +                   struct bio_list *bl)
 +{
 +      unsigned put = bio_list_fail(rs, stripe, bl);
 +
 +      while (put--)
 +              stripe_put(stripe);
 +}
 +
 +/* Fail all ios hanging off all bio lists of a stripe. */
 +static void stripe_fail_io(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      unsigned p = rs->set.raid_devs;
 +
 +      stripe_evict(stripe);
 +
 +      while (p--) {
 +              struct stripe_set *ss = stripe->ss + p;
 +              int i = ARRAY_SIZE(ss->bl);
 +
 +              while (i--)
 +                      stripe_bio_list_fail(rs, stripe, ss->bl + i);
 +      }
 +}
 +
 +/*
 + * Handle all stripes by handing them to the daemon, because we can't
 + * map their pages to copy the data in interrupt context.
 + *
 + * We don't want to handle them here either, while interrupts are disabled.
 + */
 +
 +/* Read/write endio function for dm-io (interrupt context). */
 +static void endio(unsigned long error, void *context)
 +{
 +      struct dm_mem_cache_object *obj = context;
 +      struct stripe_set *ss = obj->private;
 +      struct stripe *stripe = ss->stripe;
 +      struct page *page = obj->pl->page;
 +
 +      if (unlikely(error))
 +              stripe_error(stripe, page);
 +      else
 +              page_set(page, CLEAN);
 +
 +      __clear_page_locked(page);
 +      stripe_io_dec(stripe);
 +
 +      /* Add stripe to endio list and wake daemon. */
 +      stripe_endio_push(stripe);
 +}
 +
 +/*
 + * Recovery io throttling
 + */
 +/* Conditionally reset io counters. */
 +enum count_type { IO_WORK = 0, IO_RECOVER };
 +static int recover_io_reset(struct raid_set *rs)
 +{
 +      unsigned long j = jiffies;
 +
 +      /* Pay attention to jiffies overflows. */
 +      if (j > rs->recover.last_jiffies + HZ
 +          || j < rs->recover.last_jiffies) {
 +              rs->recover.last_jiffies = j;
 +              atomic_set(rs->recover.io_count + IO_WORK, 0);
 +              atomic_set(rs->recover.io_count + IO_RECOVER, 0);
 +              return 1;
 +      }
 +
 +      return 0;
 +}
 +
 +/* Count ios. */
 +static INLINE void
 +recover_io_count(struct raid_set *rs, struct stripe *stripe)
 +{
 +      if (RSRecover(rs)) {
 +              recover_io_reset(rs);
 +              atomic_inc(rs->recover.io_count +
 +                         (StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
 +      }
 +}
 +
 +/* Read/Write a page_list asynchronously. */
 +static void page_list_rw(struct stripe *stripe, unsigned p)
 +{
 +      struct stripe_cache *sc = stripe->sc;
 +      struct raid_set *rs = RS(sc);
 +      struct dm_mem_cache_object *obj = stripe->obj + p;
 +      struct page_list *pl = obj->pl;
 +      struct page *page = pl->page;
 +      struct raid_dev *dev = rs->dev + p;
 +      struct dm_io_region io = {
 +              .bdev = dev->dev->bdev,
 +              .sector = stripe->key,
 +              .count = stripe->io.size,
 +      };
 +      struct dm_io_request control = {
 +              .bi_rw = PageDirty(page) ? WRITE : READ,
 +              .mem.type = DM_IO_PAGE_LIST,
 +              .mem.ptr.pl = pl,
 +              .mem.offset = 0,
 +              .notify.fn = endio,
 +              .notify.context = obj,
 +              .client = sc->dm_io_client,
 +      };
 +
 +      BUG_ON(PageLocked(page));
 +
 +      /*
 +       * Don't rw past end of device, which can happen, because
  +       * typically sectors_per_dev isn't divisible by io_size.
 +       */
 +      if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
 +              io.count = rs->set.sectors_per_dev - io.sector;
 +
 +      io.sector += dev->start;        /* Add <offset>. */
 +      recover_io_count(rs, stripe);   /* Recovery io accounting. */
 +
 +      /* REMOVEME: statistics. */
 +      atomic_inc(rs->stats +
 +                  (PageDirty(page) ? S_DM_IO_WRITE : S_DM_IO_READ));
 +
 +      ClearPageError(page);
 +      __set_page_locked(page);
 +      io_dev_queued(dev);
 +      BUG_ON(dm_io(&control, 1, &io, NULL));
 +}
 +
 +/*
 + * Write dirty / read not uptodate page lists of a stripe.
 + */
 +static unsigned stripe_page_lists_rw(struct raid_set *rs, struct stripe *stripe)
 +{
 +      unsigned r;
 +
 +      /*
 +       * Increment the pending count on the stripe
 +       * first, so that we don't race in endio().
 +       *
 +       * An inc (IO) is needed for any page:
 +       *
 +       * o not uptodate
 +       * o dirtied by writes merged
 +       * o dirtied by parity calculations
 +       */
 +      r = for_each_io_dev(rs, stripe, _stripe_io_inc);
 +      if (r) {
 +              /* io needed: chunks are not uptodate/dirty. */
 +              int max;        /* REMOVEME: */
 +              struct stripe_cache *sc = &rs->sc;
 +
 +              if (!TestSetStripeActive(stripe))
 +                      atomic_inc(&sc->active_stripes);
 +
 +              /* Take off the lru list in case it got added there. */
 +              stripe_lru_del(stripe, LIST_LOCKED);
 +
 +              /* Submit actual io. */
 +              for_each_io_dev(rs, stripe, page_list_rw);
 +
 +              /* REMOVEME: statistics */
 +              max = sc_active(sc);
 +              if (atomic_read(&sc->max_active_stripes) < max)
 +                      atomic_set(&sc->max_active_stripes, max);
 +
 +              atomic_inc(rs->stats + S_FLUSHS);
 +              /* END REMOVEME: statistics */
 +      }
 +
 +      return r;
 +}
 +
 +/* Work in all pending writes. */
 +static INLINE void _writes_merge(struct stripe *stripe, unsigned p)
 +{
 +      struct bio_list *write = BL(stripe, p, WRITE);
 +
 +      if (!bio_list_empty(write)) {
 +              struct page_list *pl = stripe->obj[p].pl;
 +              struct bio *bio;
 +              struct bio_list *write_merged = BL(stripe, p, WRITE_MERGED);
 +
 +              /*
 +               * We can play with the lists without holding a lock,
 +               * because it is just us accessing them anyway.
 +               */
 +              bio_list_for_each(bio, write)
 +                      bio_copy_page_list(WRITE, stripe, pl, bio);
 +
 +              bio_list_merge(write_merged, write);
 +              bio_list_init(write);
 +              page_set(pl->page, DIRTY);
 +      }
 +}
 +
 +/* Merge in all writes hence dirtying respective pages. */
 +static INLINE void writes_merge(struct stripe *stripe)
 +{
 +      unsigned p = RS(stripe->sc)->set.raid_devs;
 +
 +      while (p--)
 +              _writes_merge(stripe, p);
 +}
 +
  +/* Check if a chunk gets completely overwritten. */
 +static INLINE int stripe_check_overwrite(struct stripe *stripe, unsigned p)
 +{
 +      unsigned sectors = 0;
 +      struct bio *bio;
 +      struct bio_list *bl = BL(stripe, p, WRITE);
 +
 +      bio_list_for_each(bio, bl)
 +              sectors += bio_sectors(bio);
 +
 +      return sectors == RS(stripe->sc)->set.io_size;
 +}
 +
 +/*
 + * Prepare stripe to avoid io on broken/reconstructed
  + * drive in order to reconstruct data on endio.
 + */
 +enum prepare_type { IO_ALLOW, IO_PROHIBIT };
 +static void stripe_prepare(struct stripe *stripe, unsigned p,
 +                         enum prepare_type type)
 +{
 +      struct page *page = PAGE(stripe, p);
 +
 +      switch (type) {
 +      case IO_PROHIBIT:
 +              /*
  +               * In case we prohibit, we have to make sure that io
  +               * on all chunks other than the one which failed or is
  +               * being reconstructed is allowed, and that the latter
  +               * is not flagged uptodate.
 +               */
 +              stripe_allow_io(stripe);
 +              ClearPageUptodate(page);
 +              ProhibitPageIO(page);
 +
 +              /* REMOVEME: statistics. */
 +              atomic_inc(RS(stripe->sc)->stats + S_PROHIBITPAGEIO);
 +              stripe->idx.recover = p;
 +              SetStripeReconstruct(stripe);
 +              break;
 +
 +      case IO_ALLOW:
 +              AllowPageIO(page);
 +              stripe->idx.recover = -1;
 +              ClearStripeReconstruct(stripe);
 +              break;
 +
 +      default:
 +              BUG();
 +      }
 +}
 +
 +/*
 + * Degraded/reconstruction mode.
 + *
  + * Check stripe state to figure out which chunks don't need IO.
 + */
 +static INLINE void stripe_check_reconstruct(struct stripe *stripe,
 +                                          int prohibited)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +
 +      /*
 +       * Degraded mode (device(s) failed) ->
 +       * avoid io on the failed device.
 +       */
 +      if (unlikely(raid_set_degraded(rs))) {
 +              /* REMOVEME: statistics. */
 +              atomic_inc(rs->stats + S_DEGRADED);
 +              stripe_prepare(stripe, rs->set.ei, IO_PROHIBIT);
 +              return;
 +      } else {
 +              /*
 +               * Reconstruction mode (ie. a particular device or
 +               * some (rotating) parity chunk is being resynchronized) ->
 +               *   o make sure all needed pages are read in
 +               *   o writes are allowed to go through
 +               */
 +              int r = region_state(rs, stripe->key, DM_RH_NOSYNC);
 +
 +              if (r) {
 +                      /* REMOVEME: statistics. */
 +                      atomic_inc(rs->stats + S_NOSYNC);
 +                      stripe_prepare(stripe, dev_for_parity(stripe),
 +                                     IO_PROHIBIT);
 +                      return;
 +              }
 +      }
 +
 +      /*
 +       * All disks good. Avoid reading parity chunk and reconstruct it
 +       * unless we have prohibited io to chunk(s).
 +       */
 +      if (!prohibited) {
 +              if (StripeMerged(stripe))
 +                      stripe_prepare(stripe, stripe->idx.parity, IO_ALLOW);
 +              else {
 +                      stripe_prepare(stripe, stripe->idx.parity, IO_PROHIBIT);
 +
 +                      /*
 +                       * Overrule stripe_prepare to reconstruct the
 +                       * parity chunk, because it'll be created new anyway.
 +                       */
 +                      ClearStripeReconstruct(stripe);
 +              }
 +      }
 +}
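  +
  +/*
  + * In short: a degraded set prohibits io on the failed device's chunk and
  + * flags reconstruction; a region still marked nosync prohibits io on the
  + * dev_for_parity() chunk instead; otherwise, with nothing prohibited,
  + * merged write stripes keep parity io allowed while all other stripes
  + * skip reading the parity chunk, since it gets recalculated anyway.
  + */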
 +
  +/* Check if stripe is ready to merge writes. */
 +static INLINE int stripe_check_merge(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      int prohibited = 0;
 +      unsigned chunks = 0, p = rs->set.raid_devs;
 +
 +      /* Walk all chunks. */
 +      while (p--) {
 +              struct page *page = PAGE(stripe, p);
 +
 +              /* Can't merge active chunks. */
 +              if (PageLocked(page)) {
 +                      /* REMOVEME: statistics. */
 +                      atomic_inc(rs->stats + S_MERGE_PAGE_LOCKED);
 +                      break;
 +              }
 +
 +              /* Can merge uptodate chunks and have to count parity chunk. */
 +              if (PageUptodate(page) || p == stripe->idx.parity) {
 +                      chunks++;
 +                      continue;
 +              }
 +
 +              /* Read before write ordering. */
 +              if (RSCheckOverwrite(rs) &&
 +                  bio_list_empty(BL(stripe, p, READ))) {
 +                      int r = stripe_check_overwrite(stripe, p);
 +
 +                      if (r) {
 +                              chunks++;
 +                              /* REMOVEME: statistics. */
 +                              atomic_inc(RS(stripe->sc)->stats +
 +                                         S_PROHIBITPAGEIO);
 +                              ProhibitPageIO(page);
 +                              prohibited = 1;
 +                      }
 +              }
 +      }
 +
 +      if (chunks == rs->set.raid_devs) {
  +              /* All pages are uptodate, get written over, or a mixture of both. */
 +              /* REMOVEME: statistics. */
 +              atomic_inc(rs->stats + S_CAN_MERGE);
 +              return 0;
 +      } else
 +              /* REMOVEME: statistics.*/
 +              atomic_inc(rs->stats + S_CANT_MERGE);
 +
 +      return prohibited ? 1 : -EPERM;
 +}
 +
  +/* Check for chunks without queued reads and prohibit io on them. */
 +static INLINE int stripe_check_read(struct stripe *stripe)
 +{
 +      int r = 0;
 +      unsigned p = RS(stripe->sc)->set.raid_devs;
 +
 +      /* Walk all chunks. */
 +      while (p--) {
 +              struct page *page = PAGE(stripe, p);
 +
 +              if (!PageLocked(page) &&
 +                  bio_list_empty(BL(stripe, p, READ))) {
 +                      ProhibitPageIO(page);
 +                      r = 1;
 +              }
 +      }
 +
 +      return r;
 +}
 +
 +/*
 + * Read/write a stripe.
 + *
 + * All stripe read/write activity goes through this function.
 + *
 + * States to cover:
 + *   o stripe to read and/or write
 + *   o stripe with error to reconstruct
 + */
 +static int stripe_rw(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      int prohibited = 0, r;
 +
 +      /*
 +       * Check the state of the RAID set and if degraded (or
 +       * resynchronizing for reads), read in all other chunks but
 +       * the one on the dead/resynchronizing device in order to be
 +       * able to reconstruct the missing one.
 +       *
 +       * Merge all writes hanging off uptodate pages of the stripe.
 +       */
 +
 +      /* Initially allow io on all chunks and prohibit below, if necessary. */
 +      stripe_allow_io(stripe);
 +
 +      if (StripeRBW(stripe)) {
 +              r = stripe_check_merge(stripe);
 +              if (!r) {
 +                      /*
 +                       * If I could rely on valid parity (which would only
 +                       * be sure in case of a full synchronization),
 +                       * I could xor a fraction of chunks out of
 +                       * parity and back in.
 +                       *
  +                       * For the time being, I've got to redo parity...
 +                       */
 +                      /* parity_xor(stripe); */       /* Xor chunks out. */
 +                      stripe_zero_chunk(stripe, stripe->idx.parity);
 +                      writes_merge(stripe);           /* Merge writes in. */
 +                      parity_xor(stripe);             /* Update parity. */
 +                      ClearStripeRBW(stripe);         /* Disable RBW. */
 +                      SetStripeMerged(stripe);        /* Writes merged. */
 +              }
 +
 +              if (r > 0)
 +                      prohibited = 1;
 +      } else if (!raid_set_degraded(rs))
 +              /* Only allow for read avoidance if not degraded. */
 +              prohibited = stripe_check_read(stripe);
 +
 +      /*
  +       * Check if io needs to be allowed/prohibited on certain chunks
 +       * because of a degraded set or reconstruction on a region.
 +       */
 +      stripe_check_reconstruct(stripe, prohibited);
 +
 +      /* Now submit any reads/writes. */
 +      r = stripe_page_lists_rw(rs, stripe);
 +      if (!r) {
 +              /*
 +               * No io submitted because of chunk io prohibited or
 +               * locked pages -> push to end io list for processing.
 +               */
 +              atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
 +              stripe_endio_push(stripe);
 +              wake_do_raid(rs);       /* Wake myself. */
 +      }
 +
 +      return 0;
 +}
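  +
  +/*
  + * RBW path in short: once all chunks are mergeable, the parity chunk is
  + * zeroed, pending writes are copied into the stripe pages, parity is
  + * recalculated over the whole stripe and the dirtied pages, including
  + * the new parity, go out via stripe_page_lists_rw().
  + */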
 +
  +/* Flush stripe either via flush list or immediately. */
 +enum flush_type { FLUSH_DELAY, FLUSH_NOW };
 +static int stripe_flush(struct stripe *stripe, enum flush_type type)
 +{
 +      int r = 0;
 +
 +      stripe_lru_del(stripe, LIST_LOCKED);
 +
 +      /* Immediately flush. */
 +      if (type == FLUSH_NOW) {
 +              if (likely(raid_set_operational(RS(stripe->sc))))
 +                      r = stripe_rw(stripe); /* Read/write stripe. */
 +              else
 +                      /* Optimization: Fail early on failed sets. */
 +                      stripe_fail_io(stripe);
 +      /* Delay flush by putting it on io list for later processing. */
 +      } else if (type == FLUSH_DELAY)
 +              stripe_io_add(stripe, POS_TAIL, LIST_UNLOCKED);
 +      else
 +              BUG();
 +
 +      return r;
 +}
 +
 +/*
 + * Queue reads and writes to a stripe by hanging
  + * their bios off the stripe sets' read/write lists.
 + *
 + * Endio reads on uptodate chunks.
 + */
 +static INLINE int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
 +                                 struct bio_list *reject)
 +{
 +      int r = 0;
 +      struct address addr;
 +      struct stripe *stripe =
 +              stripe_get(rs, raid_address(rs, bio->bi_sector, &addr));
 +
 +      if (stripe) {
 +              int rr, rw = bio_data_dir(bio);
 +
 +              rr = stripe_lock(rs, stripe, rw, addr.key); /* Lock stripe */
 +              if (rr) {
 +                      stripe_put(stripe);
 +                      goto out;
 +              }
 +
 +              /* Distinguish read and write cases. */
 +              bio_list_add(BL(stripe, addr.di, rw), bio);
 +
 +              /* REMOVEME: statistics */
 +              atomic_inc(rs->stats + (rw == WRITE ?
 +                         S_BIOS_ADDED_WRITE : S_BIOS_ADDED_READ));
 +
 +              if (rw == READ)
 +                      SetStripeRead(stripe);
 +              else {
 +                      SetStripeRBW(stripe);
 +
  +                      /* Increment pending write count on region. */
 +                      dm_rh_inc(rs->recover.rh, stripe->region);
 +                      r = 1;  /* Region hash needs a flush. */
 +              }
 +
 +              /*
 +               * Optimize stripe flushing:
 +               *
 +               * o directly start io for read stripes.
 +               *
  +               * o put stripe onto the stripe cache's io list for RBW,
  +               *   so that do_flush() can belabour it after we add
  +               *   more bios to the stripe for overwrite optimization.
 +               */
 +              stripe_flush(stripe,
 +                           StripeRead(stripe) ? FLUSH_NOW : FLUSH_DELAY);
 +
 +      /* Got no stripe from cache -> reject bio. */
 +      } else {
 +out:
 +              bio_list_add(reject, bio);
 +              /* REMOVEME: statistics. */
 +              atomic_inc(rs->stats + S_IOS_POST);
 +      }
 +
 +      return r;
 +}
 +
 +/*
 + * Recovery functions
 + */
 +/* Read a stripe off a raid set for recovery. */
 +static int recover_read(struct raid_set *rs, struct stripe *stripe, int idx)
 +{
 +      /* Invalidate all pages so that they get read in. */
 +      stripe_pages_invalidate(stripe);
 +
 +      /* Allow io on all recovery chunks. */
 +      stripe_allow_io(stripe);
 +
 +      if (idx > -1)
 +              ProhibitPageIO(PAGE(stripe, idx));
 +
 +      stripe->key = rs->recover.pos;
 +      return stripe_page_lists_rw(rs, stripe);
 +}
 +
 +/* Write a stripe to a raid set for recovery. */
 +static int recover_write(struct raid_set *rs, struct stripe *stripe, int idx)
 +{
 +      /*
 +       * If this is a reconstruct of a particular device, then
 +       * reconstruct the respective page(s), else create parity page(s).
 +       */
 +      if (idx > -1) {
 +              struct page *page = PAGE(stripe, idx);
 +
 +              AllowPageIO(page);
 +              stripe_zero_chunk(stripe, idx);
 +              common_xor(stripe, stripe->io.size, 0, idx);
 +              page_set(page, DIRTY);
 +      } else
 +              parity_xor(stripe);
 +
 +      return stripe_page_lists_rw(rs, stripe);
 +}
 +
  +/* Recovery bandwidth available? */
 +static int recover_bandwidth(struct raid_set *rs)
 +{
 +      int r, work;
 +
 +      /* On reset -> allow recovery. */
 +      r = recover_io_reset(rs);
 +      if (r || RSBandwidth(rs))
 +              goto out;
 +
 +      work = atomic_read(rs->recover.io_count + IO_WORK);
 +      if (work) {
 +              /* Pay attention to larger recover stripe size. */
 +              int recover =
 +                  atomic_read(rs->recover.io_count + IO_RECOVER) *
 +                              rs->recover.io_size /
 +                              rs->set.io_size;
 +
 +              /*
 +               * Don't use more than given bandwidth of
 +               * the work io for recovery.
 +               */
 +              if (recover > work / rs->recover.bandwidth_work) {
 +                      /* REMOVEME: statistics. */
 +                      atomic_inc(rs->stats + S_NO_BANDWIDTH);
 +                      return 0;
 +              }
 +      }
 +
 +out:
 +      atomic_inc(rs->stats + S_BANDWIDTH);    /* REMOVEME: statistics. */
 +      return 1;
 +}
 +
 +/* Try to get a region to recover. */
 +static int recover_get_region(struct raid_set *rs)
 +{
 +      struct recover *rec = &rs->recover;
 +      struct dm_region_hash *rh = rec->rh;
 +
 +      /* Start quiescing some regions. */
 +      if (!RSRegionGet(rs)) {
 +              int r = recover_bandwidth(rs); /* Enough bandwidth? */
 +
 +              if (r) {
 +                      r = dm_rh_recovery_prepare(rh);
 +                      if (r < 0) {
 +                              DMINFO("No %sregions to recover",
 +                                     rec->nr_regions_to_recover ?
 +                                     "more " : "");
 +                              return -ENOENT;
 +                      }
 +              } else
 +                      return -EAGAIN;
 +
 +              SetRSRegionGet(rs);
 +      }
 +
 +      if (!rec->reg) {
 +              rec->reg = dm_rh_recovery_start(rh);
 +              if (rec->reg) {
 +                      /*
 +                       * A reference for the region that I'll
 +                       * keep until I've completely synced it.
 +                       */
 +                      io_get(rs);
 +                      rec->pos = dm_rh_region_to_sector(rh,
 +                              dm_rh_get_region_key(rec->reg));
 +                      rec->end = rec->pos + dm_rh_get_region_size(rh);
 +                      return 1;
 +              } else
 +                      return -EAGAIN;
 +      }
 +
 +      return 0;
 +}
 +
 +/* Read/write a recovery stripe. */
 +static INLINE int recover_stripe_rw(struct raid_set *rs, struct stripe *stripe)
 +{
 +      /* Read/write flip-flop. */
 +      if (TestClearStripeRBW(stripe)) {
 +              SetStripeRead(stripe);
 +              return recover_read(rs, stripe, idx_get(rs));
 +      } else if (TestClearStripeRead(stripe))
 +              return recover_write(rs, stripe, idx_get(rs));
 +
 +      return 0;
 +}
 +
 +/* Reset recovery variables. */
 +static void recovery_region_reset(struct raid_set *rs)
 +{
 +      rs->recover.reg = NULL;
 +      ClearRSRegionGet(rs);
 +}
 +
 +/* Update region hash state. */
 +static void recover_rh_update(struct raid_set *rs, int error)
 +{
 +      struct recover *rec = &rs->recover;
 +      struct dm_region *reg = rec->reg;
 +
 +      if (reg) {
 +              dm_rh_recovery_end(reg, error);
 +              if (!error)
 +                      rec->nr_regions_recovered++;
 +
 +              recovery_region_reset(rs);
 +      }
 +
 +      dm_rh_update_states(rec->rh, 1);
 +      dm_rh_flush(rec->rh);
 +      io_put(rs);     /* Release the io reference for the region. */
 +}
 +
 +/* Called by main io daemon to recover regions. */
 +/* FIXME: cope with MAX_RECOVER > 1. */
 +static INLINE void _do_recovery(struct raid_set *rs, struct stripe *stripe)
 +{
 +      int r;
 +      struct recover *rec = &rs->recover;
 +
 +      /* If the recovery stripe is still active -> return. */
 +      if (StripeActive(stripe))
 +              return;
 +
 +      /* io error is fatal for recovery -> stop it. */
 +      if (unlikely(StripeError(stripe)))
 +              goto err;
 +
 +      /* Get a region to recover. */
 +      r = recover_get_region(rs);
 +      switch (r) {
 +      case 1: /* Got a new region. */
 +              /* Flag read before write. */
 +              ClearStripeRead(stripe);
 +              SetStripeRBW(stripe);
 +              break;
 +
 +      case 0:
 +              /* Got a region in the works. */
 +              r = recover_bandwidth(rs);
 +              if (r) /* Got enough bandwidth. */
 +                      break;
 +
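 +              /* Fall through: no recovery bandwidth available yet. */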
 +      case -EAGAIN:
 +              /* No bandwidth/quiesced region yet, try later. */
 +              wake_do_raid_delayed(rs, HZ / 10);
 +              return;
 +
 +      case -ENOENT:   /* No more regions. */
 +              dm_table_event(rs->ti->table);
 +              goto free;
 +      }
 +
 +      /* Read/write a recover stripe. */
 +      r = recover_stripe_rw(rs, stripe);
 +      if (r) {
 +              /* IO initiated, get another reference for the IO. */
 +              io_get(rs);
 +              return;
 +      }
 +
 +      /* Update recovery position within region. */
 +      rec->pos += stripe->io.size;
 +
 +      /* If we're at end of region, update region hash. */
 +      if (rec->pos >= rec->end ||
 +          rec->pos >= rs->set.sectors_per_dev)
 +              recover_rh_update(rs, 0);
 +      else
 +              SetStripeRBW(stripe);
 +
 +      /* Schedule myself for another round... */
 +      wake_do_raid(rs);
 +      return;
 +
 +err:
 +      raid_set_check_degrade(rs, stripe);
 +
 +      {
 +              char buf[BDEVNAME_SIZE];
 +
 +              DMERR("stopping recovery due to "
 +                    "ERROR on /dev/%s, stripe at offset %llu",
 +                    bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
 +                    (unsigned long long) stripe->key);
 +
 +      }
 +
 +      /* Make sure that all quiesced regions get released. */
 +      do {
 +              if (rec->reg)
 +                      dm_rh_recovery_end(rec->reg, -EIO);
 +
 +              rec->reg = dm_rh_recovery_start(rec->rh);
 +      } while (rec->reg);
 +
 +      recover_rh_update(rs, -EIO);
 +free:
 +      rs->set.dev_to_init = -1;
 +
 +      /* Check for jiffies overrun. */
 +      rs->recover.end_jiffies = jiffies;
 +      if (rs->recover.end_jiffies < rs->recover.start_jiffies)
 +              rs->recover.end_jiffies = ~0;
 +
 +      ClearRSRecover(rs);
 +}
 +
 +static INLINE void do_recovery(struct raid_set *rs)
 +{
 +      struct stripe *stripe;
 +
 +      list_for_each_entry(stripe, &rs->recover.stripes, lists[LIST_RECOVER])
 +              _do_recovery(rs, stripe);
 +
 +      if (!RSRecover(rs))
 +              stripe_recover_free(rs);
 +}
 +
 +/*
 + * END recovery functions
 + */
 +
 +/* End io process all stripes handed in by endio() callback. */
 +static void do_endios(struct raid_set *rs)
 +{
 +      struct stripe_cache *sc = &rs->sc;
 +      struct stripe *stripe;
 +
 +      while ((stripe = stripe_endio_pop(sc))) {
 +              unsigned count;
 +
 +              /* Recovery stripe special case. */
 +              if (unlikely(StripeRecover(stripe))) {
 +                      if (stripe_io(stripe))
 +                              continue;
 +
 +                      io_put(rs); /* Release region io reference. */
 +                      ClearStripeActive(stripe);
 +
 +                      /* REMOVEME: statistics. */
 +                      atomic_dec(&sc->active_stripes);
 +                      continue;
 +              }
 +
 +              /* Early end io all reads on any uptodate chunks. */
 +              stripe_endio(READ, stripe, (count = 0, &count));
 +              if (stripe_io(stripe)) {
 +                      if (count) /* REMOVEME: statistics. */
 +                              atomic_inc(rs->stats + S_ACTIVE_READS);
 +
 +                      continue;
 +              }
 +
 +              /* Set stripe inactive after all io got processed. */
 +              if (TestClearStripeActive(stripe))
 +                      atomic_dec(&sc->active_stripes);
 +
 +              /* Unlock stripe (for clustering). */
 +              stripe_unlock(rs, stripe);
 +
 +              /*
 +               * If an io error on a stripe occurred and the RAID set
 +               * is still operational, requeue the stripe for io.
 +               */
 +              if (TestClearStripeError(stripe)) {
 +                      raid_set_check_degrade(rs, stripe);
 +                      ClearStripeReconstruct(stripe);
 +
 +                      if (!StripeMerged(stripe) &&
 +                          raid_set_operational(rs)) {
 +                              stripe_pages_invalidate(stripe);
 +                              stripe_flush(stripe, FLUSH_DELAY);
 +                              /* REMOVEME: statistics. */
 +                              atomic_inc(rs->stats + S_REQUEUE);
 +                              continue;
 +                      }
 +              }
 +
 +              /* If the RAID set is inoperational, error the ios. */
 +              if (!raid_set_operational(rs)) {
 +                      ClearStripeReconstruct(stripe);
 +                      stripe_fail_io(stripe);
 +                      BUG_ON(atomic_read(&stripe->cnt));
 +                      continue;
 +              }
 +
 +              /* Got to reconstruct a missing chunk. */
 +              if (TestClearStripeReconstruct(stripe))
 +                      reconstruct_xor(stripe);
 +
 +              /*
 +               * Now that we've got a complete stripe, we can
 +               * process the rest of the end ios on reads.
 +               */
 +              BUG_ON(stripe_endio(READ, stripe, NULL));
 +              ClearStripeRead(stripe);
 +
 +              /*
 +               * Read-before-write stripes need to be flushed again in
 +               * order to work the write data into the pages *after*
 +               * they were read in.
 +               */
 +              if (TestClearStripeMerged(stripe))
 +                      /* End io all bios which got merged already. */
 +                      BUG_ON(stripe_endio(WRITE_MERGED, stripe, NULL));
 +
 +              /* Got to put on flush list because of new writes. */
 +              if (StripeRBW(stripe))
 +                      stripe_flush(stripe, FLUSH_DELAY);
 +      }
 +}
 +
 +/*
 + * Stripe cache shrinking.
 + */
 +static INLINE void do_sc_shrink(struct raid_set *rs)
 +{
 +      unsigned shrink = atomic_read(&rs->sc.stripes_to_shrink);
 +
 +      if (shrink) {
 +              unsigned cur = atomic_read(&rs->sc.stripes);
 +
 +              sc_shrink(&rs->sc, shrink);
 +              shrink -= cur - atomic_read(&rs->sc.stripes);
 +              atomic_set(&rs->sc.stripes_to_shrink, shrink);
 +
 +              /*
 +               * Wake myself up in case we failed to shrink the
 +               * requested amount in order to try again later.
 +               */
 +              if (shrink)
 +                      wake_do_raid(rs);
 +      }
 +}
 +
 +
 +/*
 + * Process all ios
 + *
 + * We do different things with the io depending on the
 + * state of the region that it's in:
 + *
 + * o reads: hang off stripe cache or postpone if full
 + *
 + * o writes:
 + *
 + *  CLEAN/DIRTY/NOSYNC:       increment pending and hang io off stripe's stripe set.
 + *                    In case stripe cache is full or busy, postpone the io.
 + *
 + *  RECOVERING:               delay the io until recovery of the region completes.
 + *
 + */
 +static INLINE void do_ios(struct raid_set *rs, struct bio_list *ios)
 +{
 +      int r;
 +      unsigned flush = 0;
 +      struct dm_region_hash *rh = rs->recover.rh;
 +      struct bio *bio;
 +      struct bio_list delay, reject;
 +
 +      bio_list_init(&delay);
 +      bio_list_init(&reject);
 +
 +      /*
 +       * Classify each io:
 +       *    o delay to recovering regions
 +       *    o queue to all other regions
 +       */
 +      while ((bio = bio_list_pop(ios))) {
 +              /*
 +               * In case we get a barrier bio, push it back onto
 +               * the input queue unless all work queues are empty
 +               * and the stripe cache is inactive.
 +               */
 +              if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 +                      /* REMOVEME: statistics. */
 +                      atomic_inc(rs->stats + S_BARRIER);
 +                      if (!list_empty(rs->sc.lists + LIST_IO) ||
 +                          !bio_list_empty(&delay) ||
 +                          !bio_list_empty(&reject) ||
 +                          sc_active(&rs->sc)) {
 +                              bio_list_push(ios, bio);
 +                              break;
 +                      }
 +              }
 +
 +              r = region_state(rs, _sector(rs, bio), DM_RH_RECOVERING);
 +              if (unlikely(r)) {
 +                      /* Got to wait for recovering regions. */
 +                      bio_list_add(&delay, bio);
 +                      SetRSBandwidth(rs);
 +              } else {
 +                      /*
 +                       * Process ios to non-recovering regions by queueing
 +                       * them to stripes (does rh_inc() for writes).
 +                       */
 +                      flush += stripe_queue_bio(rs, bio, &reject);
 +              }
 +      }
 +
 +      if (flush) {
 +              r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
 +              if (r)
 +                      DMERR("dirty log flush");
 +      }
 +
 +      /* Delay ios to regions which are recovering. */
 +      while ((bio = bio_list_pop(&delay))) {
 +              /* REMOVEME: statistics. */
 +              atomic_inc(rs->stats + S_DELAYED_BIOS);
 +              atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
 +              dm_rh_delay(rh, bio);
 +      }
 +
 +      /* Merge any rejected bios back to the head of the input list. */
 +      bio_list_merge_head(ios, &reject);
 +}
 +
 +/* Flush any stripes on the io list. */
 +static INLINE void do_flush(struct raid_set *rs)
 +{
 +      struct list_head *list = rs->sc.lists + LIST_IO, *pos, *tmp;
 +
 +      list_for_each_safe(pos, tmp, list) {
 +              int r = stripe_flush(list_entry(pos, struct stripe,
 +                                              lists[LIST_IO]), FLUSH_NOW);
 +
 +              /* Remove from the list only if the stripe got processed. */
 +              if (!r)
 +                      list_del_init(pos);
 +      }
 +}
 +
 +/* Send an event in case we're getting too busy. */
 +static INLINE void do_busy_event(struct raid_set *rs)
 +{
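 +      /* Busy <=> more than 4/5 (80%) of the cache's stripes are active. */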
 +      if ((sc_active(&rs->sc) > atomic_read(&rs->sc.stripes) * 4 / 5)) {
 +              if (!TestSetRSScBusy(rs))
 +                      dm_table_event(rs->ti->table);
 +      } else
 +              ClearRSScBusy(rs);
 +}
 +
 +/* Unplug: let the io roll on the set's devices. */
 +static INLINE void do_unplug(struct raid_set *rs)
 +{
 +      struct raid_dev *dev = rs->dev + rs->set.raid_devs;
 +
 +      while (dev-- > rs->dev) {
 +              /* Only call any device unplug function, if io got queued. */
 +              if (io_dev_clear(dev))
 +                      blk_unplug(bdev_get_queue(dev->dev->bdev));
 +      }
 +}
 +
 +/*-----------------------------------------------------------------
 + * RAID daemon
 + *---------------------------------------------------------------*/
 +/*
 + * o belabour all end ios
 + * o optionally shrink the stripe cache
 + * o update the region hash states
 + * o optionally do recovery
 + * o grab the input queue
 + * o work on all requeued or new ios and perform stripe cache flushes
 + *   unless the RAID set is inoperational (in which case we error the ios)
 + * o check if the stripe cache gets too busy and throw an event if so
 + * o unplug any component raid devices with queued bios
 + */
 +static void do_raid(struct work_struct *ws)
 +{
 +      struct raid_set *rs = container_of(ws, struct raid_set, io.dws.work);
 +      struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
 +      spinlock_t *lock = &rs->io.in_lock;
 +
 +      /*
 +       * We always need to end io, so that ios
 +       * can get errored in case the set failed
 +       * and the region counters get decremented
 +       * before we update the region hash states.
 +       */
 +redo:
 +      do_endios(rs);
 +
 +      /*
 +       * Now that we've end io'd, which may have put stripes on
 +       * the LRU list, we shrink the stripe cache if requested.
 +       */
 +      do_sc_shrink(rs);
 +
 +      /* Update region hash states before we go any further. */
 +      dm_rh_update_states(rs->recover.rh, 1);
 +
 +      /* Try to recover regions. */
 +      if (RSRecover(rs))
 +              do_recovery(rs);
 +
 +      /* More endios -> process. */
 +      if (!stripe_endio_empty(&rs->sc)) {
 +              atomic_inc(rs->stats + S_REDO);
 +              goto redo;
 +      }
 +
 +      /* Quickly grab all new ios queued and add them to the work list. */
 +      spin_lock_irq(lock);
 +      bio_list_merge(ios, ios_in);
 +      bio_list_init(ios_in);
 +      spin_unlock_irq(lock);
 +
 +      /* Let's assume we're operational most of the time ;-). */
 +      if (likely(raid_set_operational(rs))) {
 +              /* If we got ios, work them into the cache. */
 +              if (!bio_list_empty(ios)) {
 +                      do_ios(rs, ios);
 +                      do_unplug(rs);  /* Unplug the set's device queues. */
 +              }
 +
 +              do_flush(rs);           /* Flush any stripes on io list. */
 +              do_unplug(rs);          /* Unplug the set's device queues. */
 +              do_busy_event(rs);      /* Check if we got too busy. */
 +
 +              /* More endios -> process. */
 +              if (!stripe_endio_empty(&rs->sc)) {
 +                      atomic_inc(rs->stats + S_REDO);
 +                      goto redo;
 +              }
 +      } else
 +              /* No way to reconstruct data with too many devices failed. */
 +              bio_list_fail(rs, NULL, ios);
 +}
 +
 +/*
 + * Callback for region hash to dispatch
 + * delayed bios queued to recovered regions
 + * (Gets called via rh_update_states()).
 + */
 +static void dispatch_delayed_bios(void *context, struct bio_list *bl)
 +{
 +      struct raid_set *rs = context;
 +      struct bio *bio;
 +
 +      /* REMOVEME: decrement pending delayed bios counter. */
 +      bio_list_for_each(bio, bl)
 +              atomic_dec(rs->stats + S_DELAYED_BIOS);
 +
 +      /* Merge region hash private list to work list. */
 +      bio_list_merge_head(&rs->io.work, bl);
 +      bio_list_init(bl);
 +      ClearRSBandwidth(rs);
 +}
 +
 +/*************************************************************
 + * Constructor helpers
 + *************************************************************/
 +/* Calculate MB/sec. */
 +static INLINE unsigned mbpers(struct raid_set *rs, unsigned speed)
 +{
 +      return to_bytes(speed * rs->set.data_devs *
 +                      rs->recover.io_size * HZ >> 10) >> 10;
 +}
 +
 +/*
 + * Discover fastest xor algorithm and # of chunks combination.
 + */
 +/* Calculate speed for algorithm and # of chunks. */
 +static INLINE unsigned xor_speed(struct stripe *stripe)
 +{
 +      unsigned r = 0;
 +      unsigned long j;
 +
 +      /* Wait for next tick. */
 +      for (j = jiffies; j == jiffies;)
 +              ;
 +
 +      /* Do xors for a full tick. */
 +      for (j = jiffies; j == jiffies;) {
 +              mb();
 +              common_xor(stripe, stripe->io.size, 0, 0);
 +              mb();
 +              r++;
 +              mb();
 +      }
 +
 +      return r;
 +}
 +
 +/* Optimize xor algorithm for this RAID set. */
 +static unsigned xor_optimize(struct raid_set *rs)
 +{
 +      unsigned chunks_max = 2, speed_max = 0;
 +      struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
 +      struct stripe *stripe;
 +
 +      BUG_ON(list_empty(&rs->recover.stripes));
 +      stripe = list_first_entry(&rs->recover.stripes, struct stripe,
 +                          lists[LIST_RECOVER]);
 +
 +      /*
 +       * Got to allow io on all chunks, so that
 +       * xor() will actually work on them.
 +       */
 +      stripe_allow_io(stripe);
 +
 +      /* Try all xor functions. */
 +      while (f-- > xor_funcs) {
 +              unsigned speed;
 +
 +              /* Set actual xor function for common_xor(). */
 +              rs->xor.f = f;
 +              rs->xor.chunks = XOR_CHUNKS_MAX + 1;
 +
 +              while (rs->xor.chunks-- > 2) {
 +                      speed = xor_speed(stripe);
 +                      if (speed > speed_max) {
 +                              speed_max = speed;
 +                              chunks_max = rs->xor.chunks;
 +                              f_max = f;
 +                      }
 +              }
 +      }
 +
 +      /* Memorize optimum parameters. */
 +      rs->xor.f = f_max;
 +      rs->xor.chunks = chunks_max;
 +      return speed_max;
 +}
 +
 +static inline int array_too_big(unsigned long fixed, unsigned long obj,
 +                                unsigned long num)
 +{
 +      return (num > (ULONG_MAX - fixed) / obj);
 +}
 +
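 +/* No-op wakeup callback handed to dm_region_hash_create(). */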
 +static void wakeup_all_recovery_waiters(void *context)
 +{
 +}
 +
 +/*
 + * Allocate a RAID context (a RAID set)
 + */
 +static int
 +context_alloc(struct raid_set **raid_set, struct raid_type *raid_type,
 +            unsigned stripes, unsigned chunk_size, unsigned io_size,
 +            unsigned recover_io_size, unsigned raid_devs,
 +            sector_t sectors_per_dev,
 +            struct dm_target *ti, unsigned dl_parms, char **argv)
 +{
 +      int r;
 +      unsigned p;
 +      size_t len;
 +      sector_t region_size, ti_len;
 +      struct raid_set *rs = NULL;
 +      struct dm_dirty_log *dl;
 +      struct recover *rec;
 +
 +      /*
 +       * Create the dirty log
 +       *
 +       * We need to change the length for the dirty log constructor,
 +       * because we want the number of regions to be derived from the
 +       * single device size, so that we can keep the region
 +       * size = 2^^n independent of the number of devices.
 +       */
 +      ti_len = ti->len;
 +      ti->len = sectors_per_dev;
 +      dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2);
 +      ti->len = ti_len;
 +      if (!dl)
 +              goto bad_dirty_log;
 +
 +      /* Chunk size *must* not be larger than the region size. */
 +      region_size = dl->type->get_region_size(dl);
 +      if (chunk_size > region_size)
 +              goto bad_chunk_size;
 +
 +      /* Recover io size *must* not be larger than the region size either. */
 +      if (recover_io_size > region_size)
 +              goto bad_recover_io_size;
 +
 +      /* Size and allocate the RAID set structure. */
 +      len = sizeof(*rs->data) + sizeof(*rs->dev);
 +      if (array_too_big(sizeof(*rs), len, raid_devs))
 +              goto bad_array;
 +
 +      len = sizeof(*rs) + raid_devs * len;
 +      rs = kzalloc(len, GFP_KERNEL);
 +      if (!rs)
 +              goto bad_alloc;
 +
 +      rec = &rs->recover;
 +      atomic_set(&rs->io.in_process, 0);
 +      atomic_set(&rs->io.in_process_max, 0);
 +      rec->io_size = recover_io_size;
 +
 +      /* Pointer to data array. */
 +      rs->data = (unsigned long **)
 +                 ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
 +      rec->dl = dl;
 +      rs->set.raid_devs = p = raid_devs;
 +      rs->set.data_devs = raid_devs - raid_type->parity_devs;
 +      rs->set.raid_type = raid_type;
 +
 +      /*
 +       * Set chunk and io size and respective shifts
 +       * (used to avoid divisions)
 +       */
 +      rs->set.chunk_size = chunk_size;
 +      rs->set.chunk_mask = chunk_size - 1;
 +      rs->set.chunk_shift = ffs(chunk_size) - 1;
 +
 +      rs->set.io_size = io_size;
 +      rs->set.io_mask = io_size - 1;
 +      rs->set.io_shift = ffs(io_size) - 1;
 +      rs->set.io_shift_mask = rs->set.chunk_mask & ~rs->set.io_mask;
 +
 +      rs->set.pages_per_io = chunk_pages(io_size);
 +      rs->set.sectors_per_dev = sectors_per_dev;
 +
 +      rs->set.ei = -1;        /* Indicate no failed device. */
 +      atomic_set(&rs->set.failed_devs, 0);
 +
 +      rs->ti = ti;
 +
 +      atomic_set(rec->io_count + IO_WORK, 0);
 +      atomic_set(rec->io_count + IO_RECOVER, 0);
 +
 +      /* Initialize io lock and queues. */
 +      spin_lock_init(&rs->io.in_lock);
 +      bio_list_init(&rs->io.in);
 +      bio_list_init(&rs->io.work);
 +
 +      init_waitqueue_head(&rs->io.suspendq);  /* Suspend waiters (dm-io). */
 +
 +      rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
 +
 +      rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios, wake_do_raid,
 +                                      wakeup_all_recovery_waiters,
 +                                      rs->ti->begin, MAX_RECOVER, dl,
 +                                      region_size, rs->recover.nr_regions);
 +      if (IS_ERR(rec->rh))
 +              goto bad_rh;
 +
 +      /* Initialize stripe cache. */
 +      r = sc_init(rs, stripes);
 +      if (r)
 +              goto bad_sc;
 +
 +      /* Create dm-io client context. */
 +      rs->sc.dm_io_client = dm_io_client_create(rs->set.raid_devs *
 +                                                rs->set.pages_per_io);
 +      if (IS_ERR(rs->sc.dm_io_client))
 +              goto bad_dm_io_client;
 +
 +      /* REMOVEME: statistics. */
 +      stats_reset(rs);
 +      ClearRSDevelStats(rs);  /* Disable development statistics. */
 +
 +      *raid_set = rs;
 +      return 0;
 +
 +bad_dirty_log:
 +      TI_ERR_RET("Error creating dirty log", -ENOMEM);
 +
 +bad_chunk_size:
 +      dm_dirty_log_destroy(dl);
 +      TI_ERR("Chunk size larger than region size");
 +
 +bad_recover_io_size:
 +      dm_dirty_log_destroy(dl);
 +      TI_ERR("Recover stripe io size larger than region size");
 +
 +bad_array:
 +      dm_dirty_log_destroy(dl);
 +      TI_ERR("Array too big");
 +
 +bad_alloc:
 +      dm_dirty_log_destroy(dl);
 +      TI_ERR_RET("Cannot allocate raid context", -ENOMEM);
 +
 +bad_rh:
 +      dm_dirty_log_destroy(dl);
 +      ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
 +      goto free_rs;
 +
 +bad_sc:
 +      ti->error = DM_MSG_PREFIX "Error creating stripe cache";
 +      goto free;
 +
 +bad_dm_io_client:
 +      ti->error = DM_MSG_PREFIX "Error allocating dm-io resources";
 +free:
 +      sc_exit(&rs->sc);
 +      dm_region_hash_destroy(rec->rh); /* Destroys dirty log as well. */
 +free_rs:
 +      kfree(rs);
 +      return -ENOMEM;
 +}
 +
 +/* Free a RAID context (a RAID set). */
 +static void
 +context_free(struct raid_set *rs, struct dm_target *ti, unsigned r)
 +{
 +      while (r--)
 +              dm_put_device(ti, rs->dev[r].dev);
 +
 +      dm_io_client_destroy(rs->sc.dm_io_client);
 +      sc_exit(&rs->sc);
 +      dm_region_hash_destroy(rs->recover.rh);
 +      dm_dirty_log_destroy(rs->recover.dl);
 +      kfree(rs);
 +}
 +
 +/* Create work queue and initialize work. */
 +static int rs_workqueue_init(struct raid_set *rs)
 +{
 +      struct dm_target *ti = rs->ti;
 +
 +      rs->io.wq = create_singlethread_workqueue(DAEMON);
 +      if (!rs->io.wq)
 +              TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
 +
 +      INIT_DELAYED_WORK(&rs->io.dws, do_raid);
 +      return 0;
 +}
 +
 +/* Return pointer to raid_type structure for raid name. */
 +static struct raid_type *get_raid_type(char *name)
 +{
 +      struct raid_type *r = ARRAY_END(raid_types);
 +
 +      while (r-- > raid_types) {
 +              if (!strnicmp(STR_LEN(r->name, name)))
 +                      return r;
 +      }
 +
 +      return NULL;
 +}
 +
 +/* FIXME: factor out to dm core. */
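 +/* Return true if a is a multiple of b; store the quotient a/b in *n. */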
 +static int multiple(sector_t a, sector_t b, sector_t *n)
 +{
 +      sector_t r = a;
 +
 +      sector_div(r, b);
 +      *n = r;
 +      return a == r * b;
 +}
 +
 +/* Log RAID set information to kernel log. */
 +static void raid_set_log(struct raid_set *rs, unsigned speed)
 +{
 +      unsigned p;
 +      char buf[BDEVNAME_SIZE];
 +
 +      for (p = 0; p < rs->set.raid_devs; p++)
 +              DMINFO("/dev/%s is raid disk %u",
 +                     bdevname(rs->dev[p].dev->bdev, buf), p);
 +
 +      DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes",
 +             rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
 +             atomic_read(&rs->sc.stripes));
 +      DMINFO("algorithm \"%s\", %u chunks with %uMB/s", rs->xor.f->name,
 +             rs->xor.chunks, mbpers(rs, speed));
 +      DMINFO("%s set with net %u/%u devices", rs->set.raid_type->descr,
 +             rs->set.data_devs, rs->set.raid_devs);
 +}
 +
 +/* Get all devices and offsets. */
 +static int
 +dev_parms(struct dm_target *ti, struct raid_set *rs,
 +        char **argv, int *p)
 +{
 +      for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
 +              int r;
 +              unsigned long long tmp;
 +              struct raid_dev *dev = rs->dev + *p;
 +              union dev_lookup dl = {.dev = dev };
 +
 +              /* Get offset and device. */
 +              r = sscanf(argv[1], "%llu", &tmp);
 +              if (r != 1)
 +                      TI_ERR("Invalid RAID device offset parameter");
 +
 +              dev->start = tmp;
-               r = dm_get_device(ti, argv[0], dev->start,
-                                 rs->set.sectors_per_dev,
-                                 dm_table_get_mode(ti->table), &dev->dev);
++              r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
++                                &dev->dev);
 +              if (r)
 +                      TI_ERR_RET("RAID device lookup failure", r);
 +
 +              r = raid_dev_lookup(rs, bynumber, &dl);
 +              if (r != -ENODEV && r < *p) {
 +                      (*p)++; /* Ensure dm_put_device() on actual device. */
 +                      TI_ERR_RET("Duplicate RAID device", -ENXIO);
 +              }
 +      }
 +
 +      return 0;
 +}
 +
 +/* Set recovery bandwidth. */
 +static INLINE void
 +recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
 +{
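 +      /*
 +       * E.g. bandwidth = 20 (%) -> bandwidth_work = 5: recovery io gets
 +       * throttled in recover_bandwidth() once it exceeds 1/5 of the
 +       * application work io.
 +       */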
 +      rs->recover.bandwidth = bandwidth;
 +      rs->recover.bandwidth_work = 100 / bandwidth;
 +}
 +
 +/* Handle variable number of RAID parameters. */
 +static int
 +raid_variable_parms(struct dm_target *ti, char **argv,
 +                  unsigned i, int *raid_parms,
 +                  int *chunk_size, int *chunk_size_parm,
 +                  int *stripes, int *stripes_parm,
 +                  int *io_size, int *io_size_parm,
 +                  int *recover_io_size, int *recover_io_size_parm,
 +                  int *bandwidth, int *bandwidth_parm)
 +{
 +      /* Fetch # of variable raid parameters. */
 +      if (sscanf(argv[i++], "%d", raid_parms) != 1 ||
 +          !range_ok(*raid_parms, 0, 5))
 +              TI_ERR("Bad variable raid parameters number");
 +
 +      if (*raid_parms) {
 +              /*
 +               * If we've got variable RAID parameters,
 +               * chunk size is the first one
 +               */
 +              if (sscanf(argv[i++], "%d", chunk_size) != 1 ||
 +                  (*chunk_size != -1 &&
 +                   (!POWER_OF_2(*chunk_size) ||
 +                    !range_ok(*chunk_size, IO_SIZE_MIN, CHUNK_SIZE_MAX))))
 +                      TI_ERR("Invalid chunk size; must be 2^^n and <= 16384");
 +
 +              *chunk_size_parm = *chunk_size;
 +              if (*chunk_size == -1)
 +                      *chunk_size = CHUNK_SIZE;
 +
 +              /*
 +               * In case we've got 2 or more variable raid
 +               * parameters, the number of stripes is the second one
 +               */
 +              if (*raid_parms > 1) {
 +                      if (sscanf(argv[i++], "%d", stripes) != 1 ||
 +                          (*stripes != -1 &&
 +                           !range_ok(*stripes, STRIPES_MIN,
 +                                     STRIPES_MAX)))
 +                              TI_ERR("Invalid number of stripes: must "
 +                                     "be >= 8 and <= 8192");
 +              }
 +
 +              *stripes_parm = *stripes;
 +              if (*stripes == -1)
 +                      *stripes = STRIPES;
 +
 +              /*
 +               * In case we've got 3 or more variable raid
 +               * parameters, the io size is the third one.
 +               */
 +              if (*raid_parms > 2) {
 +                      if (sscanf(argv[i++], "%d", io_size) != 1 ||
 +                          (*io_size != -1 &&
 +                           (!POWER_OF_2(*io_size) ||
 +                            !range_ok(*io_size, IO_SIZE_MIN,
 +                                      min(BIO_MAX_SECTORS / 2,
 +                                      *chunk_size)))))
 +                              TI_ERR("Invalid io size; must "
 +                                     "be 2^^n and less equal "
 +                                     "min(BIO_MAX_SECTORS/2, chunk size)");
 +              } else
 +                      *io_size = *chunk_size;
 +
 +              *io_size_parm = *io_size;
 +              if (*io_size == -1)
 +                      *io_size = *chunk_size;
 +
 +              /*
 +               * In case we've got 4 variable raid parameters,
 +               * the recovery stripe io_size is the fourth one
 +               */
 +              if (*raid_parms > 3) {
 +                      if (sscanf(argv[i++], "%d", recover_io_size) != 1 ||
 +                          (*recover_io_size != -1 &&
 +                           (!POWER_OF_2(*recover_io_size) ||
 +                           !range_ok(*recover_io_size, RECOVER_IO_SIZE_MIN,
 +                                     BIO_MAX_SECTORS / 2))))
 +                              TI_ERR("Invalid recovery io size; must be "
 +                                     "2^^n and less equal BIO_MAX_SECTORS/2");
 +              }
 +
 +              *recover_io_size_parm = *recover_io_size;
 +              if (*recover_io_size == -1)
 +                      *recover_io_size = RECOVER_IO_SIZE;
 +
 +              /*
 +               * In case we've got 5 variable raid parameters,
 +               * the recovery io bandwidth is the fifth one
 +               */
 +              if (*raid_parms > 4) {
 +                      if (sscanf(argv[i++], "%d", bandwidth) != 1 ||
 +                          (*bandwidth != -1 &&
 +                           !range_ok(*bandwidth, BANDWIDTH_MIN,
 +                                     BANDWIDTH_MAX)))
 +                              TI_ERR("Invalid recovery bandwidth "
 +                                     "percentage; must be > 0 and <= 100");
 +              }
 +
 +              *bandwidth_parm = *bandwidth;
 +              if (*bandwidth == -1)
 +                      *bandwidth = BANDWIDTH;
 +      }
 +
 +      return 0;
 +}
 +
 +/* Parse optional locking parameters. */
 +static int
 +raid_locking_parms(struct dm_target *ti, char **argv,
 +                 unsigned i, int *locking_parms,
 +                 struct dm_raid45_locking_type **locking_type)
 +{
 +      *locking_parms = 0;
 +      *locking_type = &locking_none;
 +
 +      if (!strnicmp(argv[i], "none", strlen(argv[i])))
 +              *locking_parms = 1;
 +      else if (!strnicmp(argv[i + 1], "locking", strlen(argv[i + 1]))) {
 +              *locking_type = &locking_none;
 +              *locking_parms = 2;
 +      } else if (!strnicmp(argv[i + 1], "cluster", strlen(argv[i + 1]))) {
 +              *locking_type = &locking_cluster;
 +              /* FIXME: namespace. */
 +              *locking_parms = 3;
 +      }
 +
 +      return *locking_parms == 1 ? -EINVAL : 0;
 +}
 +
 +/* Set backing device information properties of RAID set. */
 +static void rs_set_bdi(struct raid_set *rs, unsigned stripes, unsigned chunks)
 +{
 +      unsigned p, ra_pages;
 +      struct mapped_device *md = dm_table_get_md(rs->ti->table);
 +      struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
 +
 +      /* Set read-ahead for the RAID set and the component devices. */
 +      bdi->ra_pages = stripes * stripe_pages(rs, rs->set.io_size);
 +      ra_pages = chunks * chunk_pages(rs->set.io_size);
 +      for (p = rs->set.raid_devs; p--; ) {
 +              struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
 +
 +              q->backing_dev_info.ra_pages = ra_pages;
 +      }
 +
 +      /* Set congested function and data. */
 +      bdi->congested_fn = raid_set_congested;
 +      bdi->congested_data = rs;
 +
 +      dm_put(md);
 +}
 +
 +/* Get backing device information properties of RAID set. */
 +static void rs_get_ra(struct raid_set *rs, unsigned *stripes, unsigned *chunks)
 +{
 +      struct mapped_device *md = dm_table_get_md(rs->ti->table);
 +
 +      *stripes = dm_disk(md)->queue->backing_dev_info.ra_pages
 +                  / stripe_pages(rs, rs->set.io_size);
 +      *chunks = bdev_get_queue(rs->dev->dev->bdev)->backing_dev_info.ra_pages
 +                / chunk_pages(rs->set.io_size);
 +
 +      dm_put(md);
 +}
 +
 +/*
 + * Construct a RAID4/5 mapping:
 + *
 + * log_type #log_params <log_params> \
 + * raid_type [#parity_dev] #raid_variable_params <raid_params> \
 + * [locking "none"/"cluster"]
 + * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
 + *
 + * log_type = "core"/"disk",
 + * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
 + * log_params = [dirty_log_path] region_size [[no]sync]
 + *
 + * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
 + *
 + * #parity_dev = N if raid_type = "raid4"
 + * o N = -1: pick default = last device
 + * o N >= 0 and < #raid_devs: parity device index
 + *
 + * #raid_variable_params = 0-5; raid_params (-1 = default):
 + *   [chunk_size [#stripes [io_size [recover_io_size [%recovery_bandwidth]]]]]
 + *   o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
 + *     and <= CHUNK_SIZE_MAX)
 + *   o #stripes is number of stripes allocated to stripe cache
 + *     (must be > 1 and < STRIPES_MAX)
 + *   o io_size (io unit size per device in sectors; must be 2^^n and > 8)
 + *   o recover_io_size (io unit size per device for recovery in sectors;
 + *     must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
 + *   o %recovery_bandwidth is the maximum amount spent on recovery during
 + *     application io (1-100%)
 + * If raid_variable_params = 0, defaults will be used.
 + * Any raid_variable_param can be set to -1 to apply a default
 + *
 + * #raid_devs = N (N >= 3)
 + *
 + * #dev_to_initialize = N
 + * -1: initialize parity on all devices
 + * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
 + * of a failed device's content after replacement
 + *
 + * <dev_path> = device_path (e.g. /dev/sdd1)
 + * <offset>   = begin at offset on <dev_path>
 + *
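 + * Hypothetical example line (start/length/target plus the constructor
 + * arguments above), assuming the target registers as "raid45" and the
 + * length is divisible by the number of data devices and the chunk size:
 + *
 + * 0 2097152 raid45 core 2 8192 nosync raid5_la 0 3 -1 \
 + *           /dev/sda1 0 /dev/sdb1 0 /dev/sdc1 0
 + *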
 + */
 +#define       MIN_PARMS       13
 +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 +{
 +      int bandwidth = BANDWIDTH, bandwidth_parm = -1,
 +          chunk_size = CHUNK_SIZE, chunk_size_parm = -1,
 +          dev_to_init, dl_parms, locking_parms, parity_parm, pi = -1,
 +          i, io_size = IO_SIZE, io_size_parm = -1,
 +          r, raid_devs, raid_parms,
 +          recover_io_size = RECOVER_IO_SIZE, recover_io_size_parm = -1,
 +          stripes = STRIPES, stripes_parm = -1;
 +      unsigned speed;
 +      sector_t tmp, sectors_per_dev;
 +      struct dm_raid45_locking_type *locking;
 +      struct raid_set *rs;
 +      struct raid_type *raid_type;
 +
 +      /* Ensure minimum number of parameters. */
 +      if (argc < MIN_PARMS)
 +              TI_ERR("Not enough parameters");
 +
 +      /* Fetch # of dirty log parameters. */
 +      if (sscanf(argv[1], "%d", &dl_parms) != 1
 +          || !range_ok(dl_parms, 1, 4711))
 +              TI_ERR("Bad dirty log parameters number");
 +
 +      /* Check raid_type. */
 +      raid_type = get_raid_type(argv[dl_parms + 2]);
 +      if (!raid_type)
 +              TI_ERR("Bad raid type");
 +
 +      /* In case of RAID4, parity drive is selectable. */
 +      parity_parm = !!(raid_type->level == raid4);
 +
 +      /* Handle variable number of RAID parameters. */
 +      r = raid_variable_parms(ti, argv, dl_parms + parity_parm + 3,
 +                              &raid_parms,
 +                              &chunk_size, &chunk_size_parm,
 +                              &stripes, &stripes_parm,
 +                              &io_size, &io_size_parm,
 +                              &recover_io_size, &recover_io_size_parm,
 +                              &bandwidth, &bandwidth_parm);
 +      if (r)
 +              return r;
 +
 +      r = raid_locking_parms(ti, argv,
 +                             dl_parms + parity_parm + raid_parms + 4,
 +                             &locking_parms, &locking);
 +      if (r)
 +              return r;
 +
 +      /* # of raid devices. */
 +      i = dl_parms + parity_parm + raid_parms + locking_parms + 4;
 +      if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
 +          raid_devs < raid_type->minimal_devs)
 +              TI_ERR("Invalid number of raid devices");
 +
 +      /* In case of RAID4, check parity drive index is in limits. */
 +      if (raid_type->level == raid4) {
 +              /* Fetch index of parity device. */
 +              if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
 +                  !range_ok(pi, 0, raid_devs - 1))
 +                      TI_ERR("Invalid RAID4 parity device index");
 +      }
 +
 +      /*
 +       * Index of device to initialize starts at 0
 +       *
 +       * o -1 -> don't initialize a particular device,
 +       * o 0..raid_devs-1 -> initialize respective device
 +       *   (used for reconstruction of a replaced device)
 +       */
 +      if (sscanf
 +          (argv[dl_parms + parity_parm + raid_parms + locking_parms + 5],
 +           "%d", &dev_to_init) != 1
 +          || !range_ok(dev_to_init, -1, raid_devs - 1))
 +              TI_ERR("Invalid number for raid device to initialize");
 +
 +      /* Check # of raid device arguments. */
 +      if (argc - dl_parms - parity_parm - raid_parms - 6 !=
 +          2 * raid_devs)
 +              TI_ERR("Wrong number of raid device/offset arguments");
 +
 +      /*
 +       * Check that the table length is divisible without
 +       * remainder by (raid_devs - parity_devs)
 +       */
 +      if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
 +                    &sectors_per_dev))
 +              TI_ERR
 +                  ("Target length not divisible by number of data devices");
 +
 +      /*
 +       * Check that the device size is
 +       * divisible without remainder by the chunk size
 +       */
 +      if (!multiple(sectors_per_dev, chunk_size, &tmp))
 +              TI_ERR("Device length not divisible by chunk_size");
 +
 +      /****************************************************************
 +       * Now that we checked the constructor arguments ->
 +       * let's allocate the RAID set
 +       ****************************************************************/
 +      r = context_alloc(&rs, raid_type, stripes, chunk_size, io_size,
 +                        recover_io_size, raid_devs, sectors_per_dev,
 +                        ti, dl_parms, argv);
 +      if (r)
 +              return r;
 +
 +      /*
 +       * Set these here in order to avoid passing
 +       * too many arguments to context_alloc()
 +       */
 +      rs->set.dev_to_init_parm = dev_to_init;
 +      rs->set.dev_to_init = dev_to_init;
 +      rs->set.pi_parm = pi;
 +      rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
 +      rs->set.raid_parms = raid_parms;
 +      rs->set.chunk_size_parm = chunk_size_parm;
 +      rs->set.io_size_parm = io_size_parm;
 +      rs->sc.stripes_parm = stripes_parm;
 +      rs->recover.io_size_parm = recover_io_size_parm;
 +      rs->recover.bandwidth_parm = bandwidth_parm;
 +      recover_set_bandwidth(rs, bandwidth);
 +
 +      /* Use locking type to lock stripe access. */
 +      rs->locking = locking;
 +
 +      /* Get the device/offset tuples. */
 +      argv += dl_parms + 6 + parity_parm + raid_parms;
 +      r = dev_parms(ti, rs, argv, &i);
 +      if (r)
 +              goto err;
 +
 +      /* Initialize recovery. */
 +      rs->recover.start_jiffies = jiffies;
 +      rs->recover.end_jiffies = 0;
 +      recovery_region_reset(rs);
 +
 +      /* Allow for recovery of any nosync regions. */
 +      SetRSRecover(rs);
 +
 +      /* Set backing device information (eg. read ahead). */
 +      rs_set_bdi(rs, chunk_size * 2, io_size * 4);
 +      SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
 +
 +      speed = xor_optimize(rs); /* Select best xor algorithm. */
 +
 +      /* Initialize work queue to handle this RAID set's io. */
 +      r = rs_workqueue_init(rs);
 +      if (r)
 +              goto err;
 +
 +      raid_set_log(rs, speed); /* Log information about RAID set. */
 +
 +      /*
 +       * Make sure that dm core only hands maximum io size
 +       * length down and pays attention to io boundaries.
 +       */
 +      ti->split_io = rs->set.io_size;
 +      ti->private = rs;
 +      return 0;
 +
 +err:
 +      context_free(rs, ti, i);
 +      return r;
 +}
 +
 +/*
 + * Destruct a raid mapping
 + */
 +static void raid_dtr(struct dm_target *ti)
 +{
 +      struct raid_set *rs = ti->private;
 +
 +      /* Indicate recovery end so that ios in flight drain. */
 +      ClearRSRecover(rs);
 +
 +      wake_do_raid(rs);       /* Wake daemon. */
 +      wait_ios(rs);           /* Wait for any io still being processed. */
 +      destroy_workqueue(rs->io.wq);
 +      context_free(rs, ti, rs->set.raid_devs);
 +}
 +
 +/* Queues ios to RAID sets. */
 +static inline void queue_bio(struct raid_set *rs, struct bio *bio)
 +{
 +      int wake;
 +      struct bio_list *in = &rs->io.in;
 +      spinlock_t *in_lock = &rs->io.in_lock;
 +
 +      spin_lock_irq(in_lock);
 +      wake = bio_list_empty(in);
 +      bio_list_add(in, bio);
 +      spin_unlock_irq(in_lock);
 +
 +      /* Wake daemon if input list was empty. */
 +      if (wake)
 +              wake_do_raid(rs);
 +}
 +
 +/* Raid mapping function. */
 +static int raid_map(struct dm_target *ti, struct bio *bio,
 +                  union map_info *map_context)
 +{
 +      /* I don't want to waste stripe cache capacity. */
 +      if (bio_rw(bio) == READA)
 +              return -EIO;
 +      else {
 +              struct raid_set *rs = ti->private;
 +
 +              /* REMOVEME: statistics. */
 +              atomic_inc(rs->stats +
 +                         (bio_data_dir(bio) == WRITE ?
 +                          S_BIOS_WRITE : S_BIOS_READ));
 +
 +              /*
 +               * Get io reference to be waiting for to drop
 +               * to zero on device suspension/destruction.
 +               */
 +              io_get(rs);
 +              bio->bi_sector -= ti->begin;    /* Remap sector. */
 +              queue_bio(rs, bio);             /* Queue to the daemon. */
 +              return DM_MAPIO_SUBMITTED;      /* Handle later. */
 +      }
 +}
 +
 +/* Device suspend. */
 +static void raid_postsuspend(struct dm_target *ti)
 +{
 +      struct raid_set *rs = ti->private;
 +      struct dm_dirty_log *dl = rs->recover.dl;
 +
 +      SetRSSuspended(rs);
 +
 +      if (RSRecover(rs))
 +              dm_rh_stop_recovery(rs->recover.rh); /* Wakes do_raid(). */
 +      else
 +              wake_do_raid(rs);
 +
 +      wait_ios(rs);   /* Wait for completion of all ios being processed. */
 +      if (dl->type->postsuspend && dl->type->postsuspend(dl))
 +              /* Suspend dirty log. */
 +              /* FIXME: need better error handling. */
 +              DMWARN("log suspend failed");
 +}
 +
 +/* Device resume. */
 +static void raid_resume(struct dm_target *ti)
 +{
 +      struct raid_set *rs = ti->private;
 +      struct recover *rec = &rs->recover;
 +      struct dm_dirty_log *dl = rec->dl;
 +
 +      if (dl->type->resume && dl->type->resume(dl))
 +              /* Resume dirty log. */
 +              /* FIXME: need better error handling. */
 +              DMWARN("log resume failed");
 +
 +      rec->nr_regions_to_recover =
 +          rec->nr_regions - dl->type->get_sync_count(dl);
 +
 +      ClearRSSuspended(rs);
 +
 +      /* Reset any unfinished recovery. */
 +      if (RSRecover(rs)) {
 +              recovery_region_reset(rs);
 +              dm_rh_start_recovery(rec->rh);/* Calls wake_do_raid(). */
 +      } else
 +              wake_do_raid(rs);
 +}
 +
 +static INLINE unsigned sc_size(struct raid_set *rs)
 +{
 +      return to_sector(atomic_read(&rs->sc.stripes) *
 +                       (sizeof(struct stripe) +
 +                        (sizeof(struct stripe_set) +
 +                         (sizeof(struct page_list) +
 +                          to_bytes(rs->set.io_size) *
 +                          rs->set.raid_devs)) +
 +                        (rs->recover.
 +                         end_jiffies ? 0 : to_bytes(rs->set.raid_devs *
 +                                                    rs->recover.
 +                                                    io_size))));
 +}
 +
 +/* REMOVEME: status output for development. */
 +static void
 +raid_devel_stats(struct dm_target *ti, char *result,
 +               unsigned *size, unsigned maxlen)
 +{
 +      unsigned chunks, stripes, sz = *size;
 +      unsigned long j;
 +      char buf[BDEVNAME_SIZE], *p;
 +      struct stats_map *sm, *sm_end = ARRAY_END(stats_map);
 +      struct raid_set *rs = ti->private;
 +      struct recover *rec = &rs->recover;
 +      struct timespec ts;
 +
 +      DMEMIT("%s ", version);
 +      DMEMIT("io_inprocess=%d ", atomic_read(&rs->io.in_process));
 +      DMEMIT("io_inprocess_max=%d ", atomic_read(&rs->io.in_process_max));
 +
 +      for (sm = stats_map; sm < sm_end; sm++)
 +              DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
 +
 +      DMEMIT(" overwrite=%s ", RSCheckOverwrite(rs) ? "on" : "off");
 +      DMEMIT("sc=%u/%u/%u/%u/%u ", rs->set.chunk_size, rs->set.io_size,
 +             atomic_read(&rs->sc.stripes), rs->sc.hash.buckets,
 +             sc_size(rs));
 +
 +      j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
 +          rec->start_jiffies;
 +      jiffies_to_timespec(j, &ts);
 +      sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
 +      p = strchr(buf, '.');
 +      p[3] = 0;
 +
 +      DMEMIT("rg=%llu%s/%llu/%llu/%u %s ",
 +             (unsigned long long) rec->nr_regions_recovered,
 +             RSRegionGet(rs) ? "+" : "",
 +             (unsigned long long) rec->nr_regions_to_recover,
 +             (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
 +
 +      rs_get_ra(rs, &stripes, &chunks);
 +      DMEMIT("ra=%u/%u ", stripes, chunks);
 +
 +      *size = sz;
 +}
 +
 +static int
 +raid_status(struct dm_target *ti, status_type_t type,
 +          char *result, unsigned maxlen)
 +{
 +      unsigned i, sz = 0;
 +      char buf[BDEVNAME_SIZE];
 +      struct raid_set *rs = ti->private;
 +
 +      switch (type) {
 +      case STATUSTYPE_INFO:
 +              /* REMOVEME: statistics. */
 +              if (RSDevelStats(rs))
 +                      raid_devel_stats(ti, result, &sz, maxlen);
 +
 +              DMEMIT("%u ", rs->set.raid_devs);
 +
 +              for (i = 0; i < rs->set.raid_devs; i++)
 +                      DMEMIT("%s ",
 +                             format_dev_t(buf, rs->dev[i].dev->bdev->bd_dev));
 +
 +              DMEMIT("1 ");
 +              for (i = 0; i < rs->set.raid_devs; i++) {
 +                      DMEMIT("%c", dev_operational(rs, i) ? 'A' : 'D');
 +
 +                      if (rs->set.raid_type->level == raid4 &&
 +                          i == rs->set.pi)
 +                              DMEMIT("p");
 +
 +                      if (rs->set.dev_to_init == i)
 +                              DMEMIT("i");
 +              }
 +
 +              break;
 +
 +      case STATUSTYPE_TABLE:
 +              sz = rs->recover.dl->type->status(rs->recover.dl, type,
 +                                                result, maxlen);
 +              DMEMIT("%s %u ", rs->set.raid_type->name,
 +                     rs->set.raid_parms);
 +
 +              if (rs->set.raid_type->level == raid4)
 +                      DMEMIT("%d ", rs->set.pi_parm);
 +
 +              if (rs->set.raid_parms)
 +                      DMEMIT("%d ", rs->set.chunk_size_parm);
 +
 +              if (rs->set.raid_parms > 1)
 +                      DMEMIT("%d ", rs->sc.stripes_parm);
 +
 +              if (rs->set.raid_parms > 2)
 +                      DMEMIT("%d ", rs->set.io_size_parm);
 +
 +              if (rs->set.raid_parms > 3)
 +                      DMEMIT("%d ", rs->recover.io_size_parm);
 +
 +              if (rs->set.raid_parms > 4)
 +                      DMEMIT("%d ", rs->recover.bandwidth_parm);
 +
 +              DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
 +
 +              for (i = 0; i < rs->set.raid_devs; i++)
 +                      DMEMIT("%s %llu ",
 +                             format_dev_t(buf,
 +                                          rs->dev[i].dev->bdev->bd_dev),
 +                             (unsigned long long) rs->dev[i].start);
 +      }
 +
 +      return 0;
 +}
 +
 +/*
 + * Message interface
 + */
 +enum raid_msg_actions {
 +      act_bw,                 /* Recovery bandwidth switch. */
 +      act_dev,                /* Device failure switch. */
 +      act_overwrite,          /* Stripe overwrite check. */
 +      act_read_ahead,         /* Set read ahead. */
 +      act_stats,              /* Development statistics switch. */
 +      act_sc,                 /* Stripe cache switch. */
 +
 +      act_on,                 /* Set entity on. */
 +      act_off,                /* Set entity off. */
 +      act_reset,              /* Reset entity. */
 +
 +      act_set = act_on,       /* Set # absolute. */
 +      act_grow = act_off,     /* Grow # by an amount. */
 +      act_shrink = act_reset, /* Shrink # by an amount. */
 +};
 +
 +/* Turn a delta to absolute. */
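 +/*
 + * E.g. current act = 128, argument r = 32:
 + * set -> 32, grow -> 160, shrink -> 96.
 + */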
 +static int _absolute(unsigned long action, int act, int r)
 +{
 +      /* Make delta absolute. */
 +      if (test_bit(act_set, &action))
 +              ;
 +      else if (test_bit(act_grow, &action))
 +              r += act;
 +      else if (test_bit(act_shrink, &action))
 +              r = act - r;
 +      else
 +              r = -EINVAL;
 +
 +      return r;
 +}
 +
 + /* Change recovery io bandwidth. */
 +static int bandwidth_change(struct dm_msg *msg, void *context)
 +{
 +      struct raid_set *rs = context;
 +      int act = rs->recover.bandwidth;
 +      int bandwidth = DM_MSG_INT_ARG(msg);
 +
 +      if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
 +              /* Make delta bandwidth absolute. */
 +              bandwidth = _absolute(msg->action, act, bandwidth);
 +
 +              /* Check range. */
 +              if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
 +                      recover_set_bandwidth(rs, bandwidth);
 +                      return 0;
 +              }
 +      }
 +
 +      set_bit(dm_msg_ret_arg, &msg->ret);
 +      set_bit(dm_msg_ret_inval, &msg->ret);
 +      return -EINVAL;
 +}
 +
 +/* Change state of a device (running/offline). */
 +/* FIXME: this only works while recovering! */
 +static int device_state(struct dm_msg *msg, void *context)
 +{
 +      int r;
 +      const char *str = "is already ";
 +      union dev_lookup dl = { .dev_name = DM_MSG_STR_ARG(msg) };
 +      struct raid_set *rs = context;
 +
 +      r = raid_dev_lookup(rs, strchr(dl.dev_name, ':') ?
 +                          bymajmin : byname, &dl);
 +      if (r == -ENODEV) {
 +              DMERR("device %s is no member of this set", dl.dev_name);
 +              return r;
 +      }
 +
 +      if (test_bit(act_off, &msg->action)) {
 +              if (dev_operational(rs, r))
 +                      str = "";
 +      } else if (!dev_operational(rs, r))
 +              str = "";
 +
 +      DMINFO("/dev/%s %s%s", dl.dev_name, str,
 +             test_bit(act_off, &msg->action) ? "offline" : "running");
 +
 +      return test_bit(act_off, &msg->action) ?
 +             raid_set_check_and_degrade(rs, NULL, r) :
 +             raid_set_check_and_upgrade(rs, r);
 +}
 +
 +/* Set/reset development feature flags. */
 +static int devel_flags(struct dm_msg *msg, void *context)
 +{
 +      struct raid_set *rs = context;
 +
 +      if (test_bit(act_on, &msg->action))
 +              return test_and_set_bit(msg->spec->parm,
 +                                      &rs->io.flags) ? -EPERM : 0;
 +      else if (test_bit(act_off, &msg->action))
 +              return test_and_clear_bit(msg->spec->parm,
 +                                        &rs->io.flags) ? 0 : -EPERM;
 +      else if (test_bit(act_reset, &msg->action)) {
 +              if (test_bit(act_stats, &msg->action)) {
 +                      stats_reset(rs);
 +                      goto on;
 +              } else if (test_bit(act_overwrite, &msg->action)) {
 +on:
 +                      set_bit(msg->spec->parm, &rs->io.flags);
 +                      return 0;
 +              }
 +      }
 +
 +      return -EINVAL;
 +}
 +
  +/* Set stripe and chunk read-ahead pages. */
 +static int read_ahead_set(struct dm_msg *msg, void *context)
 +{
 +      int stripes = DM_MSG_INT_ARGS(msg, 0);
 +      int chunks  = DM_MSG_INT_ARGS(msg, 1);
 +
 +      if (range_ok(stripes, 1, 512) &&
 +          range_ok(chunks, 1, 512)) {
 +              rs_set_bdi(context, stripes, chunks);
 +              return 0;
 +      }
 +
 +      set_bit(dm_msg_ret_arg, &msg->ret);
 +      set_bit(dm_msg_ret_inval, &msg->ret);
 +      return -EINVAL;
 +}
 +
 +/* Resize the stripe cache. */
 +static int stripecache_resize(struct dm_msg *msg, void *context)
 +{
 +      int act, stripes;
 +      struct raid_set *rs = context;
 +
  +      /* Deny permission in case the daemon is still shrinking! */
 +      if (atomic_read(&rs->sc.stripes_to_shrink))
 +              return -EPERM;
 +
 +      stripes = DM_MSG_INT_ARG(msg);
 +      if (stripes > 0) {
 +              act = atomic_read(&rs->sc.stripes);
 +
 +              /* Make delta stripes absolute. */
 +              stripes = _absolute(msg->action, act, stripes);
 +
 +              /*
 +               * Check range and that the # of stripes changes.
  +               * We can grow from here but need to leave any
 +               * shrinking to the worker for synchronization.
 +               */
 +              if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX)) {
 +                      if (stripes > act)
 +                              return sc_grow(&rs->sc, stripes - act, SC_GROW);
 +                      else if (stripes < act) {
 +                              atomic_set(&rs->sc.stripes_to_shrink,
 +                                         act - stripes);
 +                              wake_do_raid(rs);
 +                      }
 +
 +                      return 0;
 +              }
 +      }
 +
 +      set_bit(dm_msg_ret_arg, &msg->ret);
 +      set_bit(dm_msg_ret_inval, &msg->ret);
 +      return -EINVAL;
 +}
 +
 +/* Parse the RAID message action. */
 +/*
  + * 'ba[ndwidth] {se[t],g[row],sh[rink]} #'         # e.g. 'ba se 50'
  + * 'de[vice] o[ffline]/r[unning] DevName/maj:min'  # e.g. 'device o /dev/sda'
  + * 'o[verwrite] {on,of[f],r[eset]}'                # e.g. 'o of'
  + * 'r[ead_ahead] se[t] #stripes #chunks'           # e.g. 'r se 3 2'
  + * 'sta[tistics] {on,of[f],r[eset]}'               # e.g. 'stat of'
  + * 'str[ipecache] {se[t],g[row],sh[rink]} #'       # e.g. 'stripe set 1024'
 + *
 + */
 +static int
 +raid_message(struct dm_target *ti, unsigned argc, char **argv)
 +{
  +      /* Variables to store the parsed parameters in. */
 +      static int i[2];
 +      static unsigned long *i_arg[] = {
 +              (unsigned long *) i + 0,
 +              (unsigned long *) i + 1,
 +      };
 +      static char *p;
 +      static unsigned long *p_arg[] = { (unsigned long *) &p };
 +
 +      /* Declare all message option strings. */
 +      static char *str_sgs[] = { "set", "grow", "shrink" };
 +      static char *str_dev[] = { "running", "offline" };
 +      static char *str_oor[] = { "on", "off", "reset" };
 +
 +      /* Declare all actions. */
 +      static unsigned long act_sgs[] = { act_set, act_grow, act_shrink };
 +      static unsigned long act_oor[] = { act_on, act_off, act_reset };
 +
 +      /* Bandwidth option. */
 +      static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs };
 +      static struct dm_message_argument bw_args = {
 +              1, i_arg, { dm_msg_int_t }
 +      };
 +
 +      /* Device option. */
 +      static struct dm_message_option dev_opt = { 2, str_dev, act_oor };
 +      static struct dm_message_argument dev_args = {
 +              1, p_arg, { dm_msg_base_t }
 +      };
 +
 +      /* Read ahead option. */
 +      static struct dm_message_option ra_opt = { 1, str_sgs, act_sgs };
 +      static struct dm_message_argument ra_args = {
 +              2, i_arg, { dm_msg_int_t, dm_msg_int_t }
 +      };
 +
 +      static struct dm_message_argument null_args = {
 +              0, NULL, { dm_msg_int_t }
 +      };
 +
 +      /* Overwrite and statistics option. */
 +      static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor };
 +
  +      /* Stripecache option. */
 +      static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs };
 +
 +      /* Declare messages. */
 +      static struct dm_msg_spec specs[] = {
 +              { "bandwidth", act_bw, &bw_opt, &bw_args,
 +                0, bandwidth_change },
 +              { "device", act_dev, &dev_opt, &dev_args,
 +                0, device_state },
 +              { "overwrite", act_overwrite, &ovr_stats_opt, &null_args,
 +                RS_CHECK_OVERWRITE, devel_flags },
 +              { "read_ahead", act_read_ahead, &ra_opt, &ra_args,
 +                0, read_ahead_set },
 +              { "statistics", act_stats, &ovr_stats_opt, &null_args,
 +                RS_DEVEL_STATS, devel_flags },
 +              { "stripecache", act_sc, &stripe_opt, &bw_args,
 +                0, stripecache_resize },
 +      };
 +
 +      /* The message for the parser. */
 +      struct dm_msg msg = {
 +              .num_specs = ARRAY_SIZE(specs),
 +              .specs = specs,
 +      };
 +
 +      return dm_message_parse(TARGET, &msg, ti->private, argc, argv);
 +}
 +/*
 + * END message interface
 + */
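
A rough, hypothetical illustration of the table-driven dispatch idea used above (this is not the dm_message_parse() API; all names below are invented). A message such as one a user might send with something like 'dmsetup message <mapped_dev> 0 bandwidth set 50' gets matched against a spec table that pairs a command word with its handler:

    #include <stdio.h>
    #include <string.h>

    struct spec {
            const char *cmd;                        /* first word of the message */
            int (*handler)(const char *args);       /* called on a match */
    };

    static int bandwidth(const char *args)   { printf("bandwidth: %s\n", args);   return 0; }
    static int stripecache(const char *args) { printf("stripecache: %s\n", args); return 0; }

    static const struct spec specs[] = {
            { "bandwidth",   bandwidth },
            { "stripecache", stripecache },
    };

    static int dispatch(const char *cmd, const char *args)
    {
            size_t i;

            for (i = 0; i < sizeof(specs) / sizeof(specs[0]); i++)
                    if (strcmp(cmd, specs[i].cmd) == 0)
                            return specs[i].handler(args);
            return -1;      /* unknown message */
    }

    int main(void)
    {
            return dispatch("bandwidth", "set 50");
    }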
 +
 +static struct target_type raid_target = {
 +      .name = "raid45",
 +      .version = {1, 0, 0},
 +      .module = THIS_MODULE,
 +      .ctr = raid_ctr,
 +      .dtr = raid_dtr,
 +      .map = raid_map,
 +      .postsuspend = raid_postsuspend,
 +      .resume = raid_resume,
 +      .status = raid_status,
 +      .message = raid_message,
 +};
 +
 +static void init_exit(const char *bad_msg, const char *good_msg, int r)
 +{
 +      if (r)
 +              DMERR("Failed to %sregister target [%d]", bad_msg, r);
 +      else
 +              DMINFO("%s %s", good_msg, version);
 +}
 +
 +static int __init dm_raid_init(void)
 +{
 +      int r;
 +
 +      r = dm_register_target(&raid_target);
 +      init_exit("", "initialized", r);
 +      return r;
 +}
 +
 +static void __exit dm_raid_exit(void)
 +{
 +      dm_unregister_target(&raid_target);
 +      init_exit("un", "exit", 0);
 +}
 +
 +/* Module hooks. */
 +module_init(dm_raid_init);
 +module_exit(dm_raid_exit);
 +
 +MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
 +MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
 +MODULE_LICENSE("GPL");
Simple merge
diff --cc drivers/md/dm.c
Simple merge
@@@ -762,13 -809,8 +809,12 @@@ int uvc_query_v4l2_ctrl(struct uvc_vide
        struct uvc_control_mapping *mapping;
        struct uvc_menu_info *menu;
        unsigned int i;
-       __u8 *data;
        int ret;
  
 +      if ((chain->dev->quirks & UVC_QUIRK_HUE_EPIPE) &&
 +              (v4l2_ctrl->id == V4L2_CID_HUE))
 +              return -EINVAL;
 +
        ctrl = uvc_find_control(chain, v4l2_ctrl->id, &mapping);
        if (ctrl == NULL)
                return -EINVAL;
Simple merge
Simple merge
diff --cc drivers/mtd/maps/omap_nor.c
index ead0b2f,e69de29..0000000
deleted file mode 100644,100644
+++ /dev/null
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -60,11 -60,7 +60,11 @@@ static const struct e1000_info *igb_inf
        [board_82575] = &e1000_82575_info,
  };
  
 +static int entropy = 0;
 +module_param(entropy, int, 0);
 +MODULE_PARM_DESC(entropy, "Allow igb to populate the /dev/random entropy pool");
 +
- static struct pci_device_id igb_pci_tbl[] = {
+ static DEFINE_PCI_DEVICE_TABLE(igb_pci_tbl) = {
        { PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_COPPER), board_82575 },
        { PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_FIBER), board_82575 },
        { PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_SERDES), board_82575 },
Simple merge
Simple merge
Simple merge
@@@ -2312,17 -2354,11 +2350,11 @@@ static struct sk_buff *receive_new(stru
  
        skb = re->skb;
        sky2_rx_unmap_skb(sky2->hw->pdev, re);
        prefetch(skb->data);
-       re->skb = nskb;
-       if (sky2_rx_map_skb(sky2->hw->pdev, re, hdr_space)) {
-               dev_kfree_skb(nskb);
-               re->skb = skb;
-               return NULL;
-       }
+       *re = nre;
  
        if (skb_shinfo(skb)->nr_frags)
 -              skb_put_frags(skb, hdr_space, length);
 +              skb_put_frags(sky2, skb, hdr_space, length);
        else
                skb_put(skb, length);
        return skb;
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
  
  #include "tg3.h"
  
 +static int entropy = 0;
 +module_param(entropy, int, 0);
 +MODULE_PARM_DESC(entropy, "Allow tg3 to populate the /dev/random entropy pool");
 +
  #define DRV_MODULE_NAME               "tg3"
- #define PFX DRV_MODULE_NAME   ": "
- #define DRV_MODULE_VERSION    "3.106"
- #define DRV_MODULE_RELDATE    "January 12, 2010"
+ #define DRV_MODULE_VERSION    "3.108"
+ #define DRV_MODULE_RELDATE    "February 17, 2010"
  
  #define TG3_DEF_MAC_MODE      0
  #define TG3_DEF_RX_MODE               0
Simple merge
Simple merge
@@@ -112,7 -112,9 +112,8 @@@ config AIRO_C
        depends on PCMCIA && (BROKEN || !M32R)
        select WIRELESS_EXT
        select WEXT_SPY
+       select WEXT_PRIV
        select CRYPTO
 -      select CRYPTO_AES
        ---help---
          This is the standard Linux driver to support Cisco/Aironet PCMCIA
          802.11 wireless cards.  This driver is the same as the Aironet
Simple merge
Simple merge
@@@ -3107,17 -3293,17 +3293,17 @@@ static void __devinit hpsa_interrupt_mo
  default_int_mode:
  #endif                                /* CONFIG_PCI_MSI */
        /* if we get here we're going to use the default interrupt mode */
-       h->intr[SIMPLE_MODE_INT] = pdev->irq;
-       return;
+       h->intr[PERF_MODE_INT] = pdev->irq;
  }
  
 -static int hpsa_pci_init(struct ctlr_info *h, struct pci_dev *pdev)
 +static int __devinit hpsa_pci_init(struct ctlr_info *h, struct pci_dev *pdev)
  {
        ushort subsystem_vendor_id, subsystem_device_id, command;
-       __u32 board_id, scratchpad = 0;
-       __u64 cfg_offset;
-       __u32 cfg_base_addr;
-       __u64 cfg_base_addr_index;
+       u32 board_id, scratchpad = 0;
+       u64 cfg_offset;
+       u32 cfg_base_addr;
+       u64 cfg_base_addr_index;
+       u32 trans_offset;
        int i, prod_index, err;
  
        subsystem_vendor_id = pdev->subsystem_vendor;
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
  #include <linux/platform_device.h>
  #include <linux/workqueue.h>
  #include <linux/mutex.h>
+ #include <linux/pm_runtime.h>
 +#ifdef CONFIG_KDB_USB
 +#include <linux/kdb.h>
 +#endif
  
  #include <linux/usb.h>
  
@@@ -289,14 -286,7 +289,15 @@@ struct hc_driver 
                 */
        int     (*update_hub_device)(struct usb_hcd *, struct usb_device *hdev,
                        struct usb_tt *tt, gfp_t mem_flags);
+       int     (*reset_device)(struct usb_hcd *, struct usb_device *);
 +
 +#ifdef CONFIG_KDB_USB
 +      /* KDB poll function for this HC */
 +      int     (*kdb_poll_char)(struct urb *urb);
 +      void    (*kdb_completion)(struct urb *urb);
 +      kdb_hc_keyboard_attach_t        kdb_hc_keyboard_attach;
 +      kdb_hc_keyboard_detach_t        kdb_hc_keyboard_detach;
 +#endif /* CONFIG_KDB_USB */
  };
  
  extern int usb_hcd_link_urb_to_ep(struct usb_hcd *hcd, struct urb *urb);
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc fs/Kconfig
Simple merge
diff --cc fs/Makefile
Simple merge
diff --cc fs/bio.c
Simple merge
Simple merge
Simple merge
diff --cc fs/exec.c
Simple merge
diff --cc fs/ext3/file.c
  #include <linux/time.h>
  #include <linux/fs.h>
  #include <linux/jbd.h>
+ #include <linux/quotaops.h>
  #include <linux/ext3_fs.h>
  #include <linux/ext3_jbd.h>
 +#include "namei.h"
  #include "xattr.h"
  #include "acl.h"
 +#include "nfs4acl.h"
  
  /*
   * Called when an inode is released. Note that this is different
@@@ -589,15 -588,12 +589,15 @@@ got
                sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
  
        ret = inode;
-       if (vfs_dq_alloc_inode(inode)) {
-               err = -EDQUOT;
+       dquot_initialize(inode);
+       err = dquot_alloc_inode(inode);
+       if (err)
                goto fail_drop;
-       }
  
 -      err = ext3_init_acl(handle, inode, dir);
 +      if (test_opt(sb, NFS4ACL))
 +              err = ext3_nfs4acl_init(handle, inode, dir);
 +      else
 +              err = ext3_init_acl(handle, inode, dir);
        if (err)
                goto fail_free_drop;
  
diff --cc fs/ext3/inode.c
Simple merge
diff --cc fs/ext3/namei.c
Simple merge
diff --cc fs/ext3/super.c
@@@ -36,9 -36,7 +36,8 @@@
  #include <linux/namei.h>
  #include <linux/quotaops.h>
  #include <linux/seq_file.h>
 +#include <linux/nfs4acl.h>
  #include <linux/log2.h>
- #include <linux/precache.h>
  
  #include <asm/uaccess.h>
  
@@@ -534,13 -528,8 +533,15 @@@ static void destroy_inodecache(void
  static void ext3_clear_inode(struct inode *inode)
  {
        struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
 +#ifdef CONFIG_EXT3_FS_NFS4ACL
 +      if (EXT3_I(inode)->i_nfs4acl &&
 +                      EXT3_I(inode)->i_nfs4acl != EXT3_NFS4ACL_NOT_CACHED) {
 +              nfs4acl_put(EXT3_I(inode)->i_nfs4acl);
 +              EXT3_I(inode)->i_nfs4acl = EXT3_NFS4ACL_NOT_CACHED;
 +      }
 +#endif
+       dquot_drop(inode);
        ext3_discard_reservation(inode);
        EXT3_I(inode)->i_block_alloc_info = NULL;
        if (unlikely(rsv))
@@@ -1725,11 -1694,8 +1724,11 @@@ static int ext3_fill_super (struct supe
                            NULL, 0))
                goto failed_mount;
  
 -      sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
 -              (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 +      sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
-       if (sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL)
++      if (test_opt(sb, POSIX_ACL))
 +              sb->s_flags |= MS_POSIXACL;
-       if (sbi->s_mount_opt & EXT3_MOUNT_NFS4ACL)
++      if (test_opt(sb, NFS4ACL))
 +              sb->s_flags |= MS_POSIXACL | MS_WITHAPPEND;
  
        if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
            (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@@ -2603,15 -2562,11 +2602,15 @@@ static int ext3_remount (struct super_b
                goto restore_opts;
        }
  
-       if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+       if (test_opt(sb, ABORT))
                ext3_abort(sb, __func__, "Abort forced by user");
  
 -      sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
 -              (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 +      sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
-       if (sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL)
++      if (test_opt(sb, POSIX_ACL))
 +              sb->s_flags |= MS_POSIXACL;
-       if (sbi->s_mount_opt & EXT3_MOUNT_NFS4ACL)
++      if (test_opt(sb, NFS4ACL))
 +              sb->s_flags |= MS_POSIXACL;
 +
  
        es = sbi->s_es;
  
diff --cc fs/ext3/xattr.c
Simple merge
Simple merge
diff --cc fs/namei.c
@@@ -1352,21 -1317,9 +1322,21 @@@ static int may_delete(struct inode *dir
                return -ENOENT;
  
        BUG_ON(victim->d_parent->d_inode != dir);
-       audit_inode_child(victim->d_name.name, victim, dir);
+       audit_inode_child(victim, dir);
  
 -      error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
 +      if (dir->i_op->may_delete) {
 +              if (IS_RDONLY(dir))
 +                      return -EROFS;
 +              if (IS_IMMUTABLE(dir))
 +                      return -EACCES;
 +              error = dir->i_op->may_delete(dir, victim->d_inode);
 +              if (!error)
 +                      error = security_inode_permission(dir, MAY_WRITE | MAY_EXEC);
 +      } else {
 +              error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
 +              if (!error && check_sticky(dir, victim->d_inode))
 +                      error = -EPERM;
 +      }
        if (error)
                return error;
        if (IS_APPEND(dir))
@@@ -1404,37 -1355,9 +1374,21 @@@ static inline int may_create(struct ino
                return -EEXIST;
        if (IS_DEADDIR(dir))
                return -ENOENT;
 -      return inode_permission(dir, MAY_WRITE | MAY_EXEC);
 +
 +      if (dir->i_op->may_create) {
 +              if (IS_RDONLY(dir))
 +                      return -EROFS;
 +              if (IS_IMMUTABLE(dir))
 +                      return -EACCES;
 +              error = dir->i_op->may_create(dir, isdir);
 +              if (!error)
 +                      error = security_inode_permission(dir, MAY_WRITE | MAY_EXEC);
 +      } else
 +              error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
 +
 +      return error;
  }
  
- /* 
-  * O_DIRECTORY translates into forcing a directory lookup.
-  */
- static inline int lookup_flags(unsigned int f)
- {
-       unsigned long retval = LOOKUP_FOLLOW;
-       if (f & O_NOFOLLOW)
-               retval &= ~LOOKUP_FOLLOW;
-       
-       if (f & O_DIRECTORY)
-               retval |= LOOKUP_DIRECTORY;
-       return retval;
- }
  /*
   * p1 and p2 should be directories on the same fs.
   */
diff --cc fs/nfs/Kconfig
Simple merge
Simple merge
diff --cc fs/nfs/file.c
Simple merge
diff --cc fs/nfs/inode.c
Simple merge
Simple merge
diff --cc fs/nfs/write.c
@@@ -509,10 -438,10 +509,11 @@@ nfs_mark_request_commit(struct nfs_pag
        radix_tree_tag_set(&nfsi->nfs_page_tree,
                        req->wb_index,
                        NFS_PAGE_TAG_COMMIT);
+       nfsi->ncommit++;
        spin_unlock(&inode->i_lock);
        inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 -      inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
 +      inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
 +                      BDI_RECLAIMABLE);
        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
  }
  
diff --cc fs/nfsd/vfs.c
@@@ -377,16 -404,6 +404,15 @@@ nfsd_setattr(struct svc_rqst *rqstp, st
                        put_write_access(inode);
                        goto out_nfserr;
                }
-               vfs_dq_init(inode);
 +
 +              /*
 +               * Tell a Hierarchical Storage Manager (e.g. via DMAPI) to
 +               * return EAGAIN when an action would take minutes instead of
 +               * milliseconds so that NFS can reply to the client with
 +               * NFSERR_JUKEBOX instead of blocking an nfsd thread.
 +               */
 +              if (rqstp->rq_vers >= 3)
 +                      iap->ia_valid |= ATTR_NO_BLOCK;
        }
  
        /* sanitize the mode change */
Simple merge
diff --cc fs/ocfs2/aops.c
Simple merge
Simple merge
Simple merge
diff --cc fs/ocfs2/dir.c
Simple merge
diff --cc fs/ocfs2/file.c
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -54,10 -54,9 +54,11 @@@ struct ocfs2_alloc_context 
        u64    ac_last_group;
        u64    ac_max_block;  /* Highest block number to allocate. 0 is
                                 is the same as ~0 - unlimited */
 +
 +      struct ocfs2_alloc_reservation  *ac_resv;
  };
  
+ void ocfs2_init_steal_slots(struct ocfs2_super *osb);
  void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
  static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
  {
Simple merge
Simple merge
diff --cc fs/proc/array.c
Simple merge
diff --cc fs/proc/base.c
Simple merge
Simple merge
diff --cc fs/super.c
@@@ -562,13 -556,19 +556,13 @@@ out
        return err;
  }
  
 -/**
 - *    do_remount_sb - asks filesystem to change mount options.
 - *    @sb:    superblock in question
 - *    @flags: numeric part of options
 - *    @data:  the rest of options
 - *      @force: whether or not to force the change
 - *
 - *    Alters the mount options of a mounted file system.
 - */
 -int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 +#define REMOUNT_FORCE         1
 +#define REMOUNT_SHRINK_DCACHE 2
 +
 +static int __do_remount_sb(struct super_block *sb, int flags, void *data, int rflags)
  {
        int retval;
-       int remount_rw;
+       int remount_rw, remount_ro;
  
        if (sb->s_frozen != SB_UNFROZEN)
                return -EBUSY;
  
        if (flags & MS_RDONLY)
                acct_auto_close(sb);
 -      shrink_dcache_sb(sb);
 +      if (rflags & REMOUNT_SHRINK_DCACHE)
 +              shrink_dcache_sb(sb);
        sync_filesystem(sb);
  
+       remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+       remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
        /* If we are remounting RDONLY and current sb is read/write,
           make sure there are no rw files opened */
-       if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
+       if (remount_ro) {
 -              if (force)
 +              if (rflags & REMOUNT_FORCE)
                        mark_files_ro(sb);
                else if (!fs_may_remount_ro(sb))
                        return -EBUSY;
diff --cc fs/xfs/Makefile
@@@ -107,11 -105,9 +107,10 @@@ xfs-y                            += $(addprefix $(XFS_LINUX)/, 
                                   xfs_globals.o \
                                   xfs_ioctl.o \
                                   xfs_iops.o \
-                                  xfs_lrw.o \
                                   xfs_super.o \
                                   xfs_sync.o \
 -                                 xfs_xattr.o)
 +                                 xfs_xattr.o \
 +                                 xfs_ksyms.o)
  
  # Objects in support/
  xfs-y                         += $(addprefix support/, \
index 81ca41d,0000000..9332824
mode 100644,000000..100644
--- /dev/null
@@@ -1,3327 -1,0 +1,3327 @@@
 +/*
 + * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 + * All Rights Reserved.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License as
 + * published by the Free Software Foundation.
 + *
 + * This program is distributed in the hope that it would be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License
 + * along with this program; if not, write the Free Software Foundation,
 + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 + */
 +#include "xfs.h"
 +#include "xfs_fs.h"
 +#include "xfs_types.h"
 +#include "xfs_bit.h"
 +#include "xfs_log.h"
 +#include "xfs_inum.h"
 +#include "xfs_trans.h"
 +#include "xfs_sb.h"
 +#include "xfs_ag.h"
 +#include "xfs_dir2.h"
 +#include "xfs_alloc.h"
 +#include "xfs_dmapi.h"
 +#include "xfs_mount.h"
 +#include "xfs_da_btree.h"
 +#include "xfs_bmap_btree.h"
 +#include "xfs_alloc_btree.h"
 +#include "xfs_ialloc_btree.h"
 +#include "xfs_dir2_sf.h"
 +#include "xfs_attr_sf.h"
 +#include "xfs_dinode.h"
 +#include "xfs_inode.h"
 +#include "xfs_btree.h"
 +#include "xfs_ialloc.h"
 +#include "xfs_itable.h"
 +#include "xfs_bmap.h"
 +#include "xfs_rw.h"
 +#include "xfs_acl.h"
 +#include "xfs_attr.h"
 +#include "xfs_attr_leaf.h"
 +#include "xfs_inode_item.h"
 +#include "xfs_vnodeops.h"
 +#include <dmapi.h>
 +#include <dmapi_kern.h>
 +#include "xfs_dm.h"
 +
 +#include <linux/mount.h>
 +
 +#define MAXNAMLEN MAXNAMELEN
 +
 +#define MIN_DIO_SIZE(mp)              ((mp)->m_sb.sb_sectsize)
 +#define MAX_DIO_SIZE(mp)              (INT_MAX & ~(MIN_DIO_SIZE(mp) - 1))
 +
 +static void up_rw_sems(struct inode *ip, int flags)
 +{
 +      if (flags & DM_FLAGS_IALLOCSEM_WR)
 +              up_write(&ip->i_alloc_sem);
 +      if (flags & DM_FLAGS_IMUX)
 +              mutex_unlock(&ip->i_mutex);
 +}
 +
 +static void down_rw_sems(struct inode *ip, int flags)
 +{
 +      if (flags & DM_FLAGS_IMUX)
 +              mutex_lock(&ip->i_mutex);
 +      if (flags & DM_FLAGS_IALLOCSEM_WR)
 +              down_write(&ip->i_alloc_sem);
 +}
 +
 +
 +/* Structure used to hold the on-disk version of a dm_attrname_t.  All
 +   on-disk attribute names start with the 8-byte string "SGI_DMI_".
 +*/
 +
 +typedef struct        {
 +      char    dan_chars[DMATTR_PREFIXLEN + DM_ATTR_NAME_SIZE + 1];
 +} dm_dkattrname_t;
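
A hypothetical user-space illustration of the prefixing convention described above; the constant values are assumptions (the prefix length follows from the 8-byte "SGI_DMI_" string, and DM_ATTR_NAME_SIZE is assumed to be 8 as in DMAPI), since the real definitions come from the DMAPI headers:

    #include <stdio.h>
    #include <string.h>

    #define DMATTR_PREFIXSTRING "SGI_DMI_"  /* 8-byte on-disk prefix */
    #define DMATTR_PREFIXLEN    8
    #define DM_ATTR_NAME_SIZE   8           /* assumed DMAPI attribute name length */

    int main(void)
    {
            char ondisk[DMATTR_PREFIXLEN + DM_ATTR_NAME_SIZE + 1];

            /* The user-visible attribute "restore" is stored as "SGI_DMI_restore". */
            strcpy(ondisk, DMATTR_PREFIXSTRING);
            strncat(ondisk, "restore", DM_ATTR_NAME_SIZE);
            printf("%s\n", ondisk);
            return 0;
    }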
 +
 +/* Structure used by xfs_dm_get_bulkall(), used as the "private_data"
 + * that we want xfs_bulkstat to send to our formatter.
 + */
 +typedef struct {
 +      dm_fsid_t       fsid;
 +      void __user     *laststruct;
 +      dm_dkattrname_t attrname;
 +} dm_bulkstat_one_t;
 +
 +/* In the on-disk inode, DMAPI attribute names consist of the user-provided
 +   name with the DMATTR_PREFIXSTRING pre-pended.  This string must NEVER be
 +   changed!
 +*/
 +
 +static        const   char    dmattr_prefix[DMATTR_PREFIXLEN + 1] = DMATTR_PREFIXSTRING;
 +
 +static        dm_size_t  dm_min_dio_xfer = 0; /* direct I/O disabled for now */
 +
 +
 +/* See xfs_dm_get_dmattr() for a description of why this is needed. */
 +
 +#define XFS_BUG_KLUDGE        256     /* max size of an in-inode attribute value */
 +
 +#define DM_MAX_ATTR_BYTES_ON_DESTROY  256
 +
 +#define DM_STAT_SIZE(dmtype,namelen)  \
 +      (sizeof(dmtype) + sizeof(dm_handle_t) + namelen)
 +
 +#define DM_STAT_ALIGN         (sizeof(__uint64_t))
 +
 +/* DMAPI's E2BIG == EA's ERANGE */
 +#define DM_EA_XLATE_ERR(err) { if (err == ERANGE) err = E2BIG; }
 +
 +static inline size_t dm_stat_align(size_t size)
 +{
 +      return (size + (DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
 +}
 +
 +static inline size_t dm_stat_size(size_t namelen)
 +{
 +      return dm_stat_align(sizeof(dm_stat_t) + sizeof(dm_handle_t) + namelen);
 +}
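
The helpers above are plain round-up-to-an-8-byte-boundary arithmetic; a minimal standalone sketch, assuming the same sizeof(__uint64_t) alignment:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    #define STAT_ALIGN sizeof(uint64_t)     /* 8, like DM_STAT_ALIGN */

    /* Same formula as dm_stat_align(): round size up to the next 8-byte boundary. */
    static size_t stat_align(size_t size)
    {
            return (size + (STAT_ALIGN - 1)) & ~(STAT_ALIGN - 1);
    }

    int main(void)
    {
            printf("%zu %zu %zu\n", stat_align(1), stat_align(100), stat_align(104));
            /* prints: 8 104 104 */
            return 0;
    }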
 +
 +/*
 + *    xfs_dm_send_data_event()
 + *
 + *    Send data event to DMAPI.  Drop IO lock (if specified) before
 + *    the dm_send_data_event() call and reacquire it afterwards.
 + */
 +int
 +xfs_dm_send_data_event(
 +      dm_eventtype_t  event,
 +      xfs_inode_t     *ip,
 +      xfs_off_t       offset,
 +      size_t          length,
 +      int             flags,
 +      int             *lock_flags)
 +{
 +      struct inode    *inode = &ip->i_vnode;
 +      int             error;
 +      uint16_t        dmstate;
 +
 +      /* Returns positive errors to XFS */
 +
 +      do {
 +              dmstate = ip->i_d.di_dmstate;
 +              if (lock_flags)
 +                      xfs_iunlock(ip, *lock_flags);
 +
 +              up_rw_sems(inode, flags);
 +
 +              error = dm_send_data_event(event, inode, DM_RIGHT_NULL,
 +                              offset, length, flags);
 +              error = -error; /* DMAPI returns negative errors */
 +
 +              down_rw_sems(inode, flags);
 +
 +              if (lock_flags)
 +                      xfs_ilock(ip, *lock_flags);
 +      } while (!error && (ip->i_d.di_dmstate != dmstate));
 +
 +      return error;
 +}
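
The loop above is a "sample state, drop the lock, deliver, relock, retry if the state moved" pattern. A rough single-threaded user-space sketch of just that pattern (all names hypothetical; the concurrent state change is simulated inside deliver_event()):

    #include <stdio.h>

    struct obj {
            int dmstate;    /* stand-in for ip->i_d.di_dmstate */
    };

    /* Pretend an HSM changes the object's state the first time the event is delivered. */
    static int deliver_event(struct obj *o)
    {
            static int calls;

            if (calls++ == 0)
                    o->dmstate++;
            return 0;       /* success */
    }

    int main(void)
    {
            struct obj o = { .dmstate = 0 };
            int seen, error;

            do {
                    seen = o.dmstate;               /* sample state under the lock   */
                    /* ...unlock...                                                  */
                    error = deliver_event(&o);      /* event delivery while unlocked */
                    /* ...relock...                                                  */
            } while (!error && o.dmstate != seen);  /* retry if the state changed    */

            printf("delivered, dmstate=%d\n", o.dmstate);
            return 0;
    }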
 +
 +/*    prohibited_mr_events
 + *
 + *    Return event bits representing any events which cannot have managed
 + *    region events set due to memory mapping of the file.  If the maximum
 + *    protection allowed in any pregion includes PROT_WRITE, and the region
 + *    is shared and not text, then neither READ nor WRITE events can be set.
 + *    Otherwise if the file is memory mapped, no READ event can be set.
 + *
 + */
 +STATIC int
 +prohibited_mr_events(
 +      struct address_space *mapping)
 +{
 +      int prohibited = (1 << DM_EVENT_READ);
 +
 +      if (!mapping_mapped(mapping))
 +              return 0;
 +
 +      spin_lock(&mapping->i_mmap_lock);
 +      if (mapping_writably_mapped(mapping))
 +              prohibited |= (1 << DM_EVENT_WRITE);
 +      spin_unlock(&mapping->i_mmap_lock);
 +
 +      return prohibited;
 +}
 +
 +#ifdef        DEBUG_RIGHTS
 +STATIC int
 +xfs_vp_to_hexhandle(
 +      struct inode    *inode,
 +      u_int           type,
 +      char            *buffer)
 +{
 +      dm_handle_t     handle;
 +      u_char          *ip;
 +      int             length;
 +      int             error;
 +      int             i;
 +
 +      /*
 +       * XXX: dm_vp_to_handle doesn't exist.
 +       *      Looks like this debug code is rather dead.
 +       */
 +      if ((error = dm_vp_to_handle(inode, &handle)))
 +              return(error);
 +
 +      if (type == DM_FSYS_OBJ) {      /* a filesystem handle */
 +              length = DM_FSHSIZE;
 +      } else {
 +              length = DM_HSIZE(handle);
 +      }
 +      for (ip = (u_char *)&handle, i = 0; i < length; i++) {
 +              *buffer++ = "0123456789abcdef"[ip[i] >> 4];
 +              *buffer++ = "0123456789abcdef"[ip[i] & 0xf];
 +      }
 +      *buffer = '\0';
 +      return(0);
 +}
 +#endif        /* DEBUG_RIGHTS */
 +
 +
 +
 +
 +/* Copy in and validate an attribute name from user space.  It should be a
 +   string of at least one and at most DM_ATTR_NAME_SIZE characters.  Because
 +   the dm_attrname_t structure doesn't provide room for the trailing NULL
 +   byte, we just copy in one extra character and then zero it if it
 +   happens to be non-NULL.
 +*/
 +
 +STATIC int
 +xfs_copyin_attrname(
 +      dm_attrname_t   __user *from,   /* dm_attrname_t in user space */
 +      dm_dkattrname_t *to)            /* name buffer in kernel space */
 +{
 +      int error = 0;
 +      size_t len;
 +
 +      strcpy(to->dan_chars, dmattr_prefix);
 +
  +      len = strnlen_user((char __user *)from, DM_ATTR_NAME_SIZE);
  +      if (len == 0)
  +              error = EFAULT;
  +      else {
  +              if (copy_from_user(&to->dan_chars[DMATTR_PREFIXLEN], from, len))
  +                      to->dan_chars[sizeof(to->dan_chars) - 1] = '\0';
  +              else if (to->dan_chars[DMATTR_PREFIXLEN] == '\0')
  +                      error = EINVAL;
  +              else
  +                      to->dan_chars[DMATTR_PREFIXLEN + len - 1] = '\0';
  +      }
 +
 +      return error;
 +}
 +
 +
 +/*
 + * Convert the XFS flags into their DMAPI flag equivalent for export
 + */
 +STATIC uint
 +_xfs_dic2dmflags(
 +      __uint16_t              di_flags)
 +{
 +      uint                    flags = 0;
 +
 +      if (di_flags & XFS_DIFLAG_ANY) {
 +              if (di_flags & XFS_DIFLAG_REALTIME)
 +                      flags |= DM_XFLAG_REALTIME;
 +              if (di_flags & XFS_DIFLAG_PREALLOC)
 +                      flags |= DM_XFLAG_PREALLOC;
 +              if (di_flags & XFS_DIFLAG_IMMUTABLE)
 +                      flags |= DM_XFLAG_IMMUTABLE;
 +              if (di_flags & XFS_DIFLAG_APPEND)
 +                      flags |= DM_XFLAG_APPEND;
 +              if (di_flags & XFS_DIFLAG_SYNC)
 +                      flags |= DM_XFLAG_SYNC;
 +              if (di_flags & XFS_DIFLAG_NOATIME)
 +                      flags |= DM_XFLAG_NOATIME;
 +              if (di_flags & XFS_DIFLAG_NODUMP)
 +                      flags |= DM_XFLAG_NODUMP;
 +      }
 +      return flags;
 +}
 +
 +STATIC uint
 +xfs_ip2dmflags(
 +      xfs_inode_t     *ip)
 +{
 +      return _xfs_dic2dmflags(ip->i_d.di_flags) |
 +                      (XFS_IFORK_Q(ip) ? DM_XFLAG_HASATTR : 0);
 +}
 +
 +STATIC uint
 +xfs_dic2dmflags(
 +      xfs_dinode_t    *dip)
 +{
 +      return _xfs_dic2dmflags(be16_to_cpu(dip->di_flags)) |
 +                      (XFS_DFORK_Q(dip) ? DM_XFLAG_HASATTR : 0);
 +}
 +
 +/*
 + * This copies selected fields in an inode into a dm_stat structure.  Because
 + * these fields must return the same values as they would in stat(), the
 + * majority of this code was copied directly from xfs_getattr().  Any future
  + * changes to xfs_getattr() must also be reflected here.
 + */
 +STATIC void
 +xfs_dip_to_stat(
 +      xfs_mount_t             *mp,
 +      xfs_ino_t               ino,
 +      xfs_dinode_t            *dip,
 +      dm_stat_t               *buf)
 +{
 +      xfs_dinode_t    *dic = dip;
 +
 +      /*
 +       * The inode format changed when we moved the link count and
 +       * made it 32 bits long.  If this is an old format inode,
 +       * convert it in memory to look like a new one.  If it gets
 +       * flushed to disk we will convert back before flushing or
 +       * logging it.  We zero out the new projid field and the old link
 +       * count field.  We'll handle clearing the pad field (the remains
 +       * of the old uuid field) when we actually convert the inode to
 +       * the new format. We don't change the version number so that we
 +       * can distinguish this from a real new format inode.
 +       */
 +      if (dic->di_version == 1) {
 +              buf->dt_nlink = be16_to_cpu(dic->di_onlink);
 +              /*buf->dt_xfs_projid = 0;*/
 +      } else {
 +              buf->dt_nlink = be32_to_cpu(dic->di_nlink);
 +              /*buf->dt_xfs_projid = be16_to_cpu(dic->di_projid);*/
 +      }
 +      buf->dt_ino = ino;
 +      buf->dt_dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
 +      buf->dt_mode = be16_to_cpu(dic->di_mode);
 +      buf->dt_uid = be32_to_cpu(dic->di_uid);
 +      buf->dt_gid = be32_to_cpu(dic->di_gid);
 +      buf->dt_size = be64_to_cpu(dic->di_size);
 +      buf->dt_atime = be32_to_cpu(dic->di_atime.t_sec);
 +      buf->dt_mtime = be32_to_cpu(dic->di_mtime.t_sec);
 +      buf->dt_ctime = be32_to_cpu(dic->di_ctime.t_sec);
 +      buf->dt_xfs_xflags = xfs_dic2dmflags(dip);
 +      buf->dt_xfs_extsize =
 +              be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog;
 +      buf->dt_xfs_extents = be32_to_cpu(dic->di_nextents);
 +      buf->dt_xfs_aextents = be16_to_cpu(dic->di_anextents);
 +      buf->dt_xfs_igen = be32_to_cpu(dic->di_gen);
 +      buf->dt_xfs_dmstate = be16_to_cpu(dic->di_dmstate);
 +
 +      switch (dic->di_format) {
 +      case XFS_DINODE_FMT_DEV:
 +              buf->dt_rdev = xfs_dinode_get_rdev(dic);
 +              buf->dt_blksize = BLKDEV_IOSIZE;
 +              buf->dt_blocks = 0;
 +              break;
 +      case XFS_DINODE_FMT_LOCAL:
 +      case XFS_DINODE_FMT_UUID:
 +              buf->dt_rdev = 0;
 +              buf->dt_blksize = mp->m_sb.sb_blocksize;
 +              buf->dt_blocks = 0;
 +              break;
 +      case XFS_DINODE_FMT_EXTENTS:
 +      case XFS_DINODE_FMT_BTREE:
 +              buf->dt_rdev = 0;
 +              buf->dt_blksize = mp->m_sb.sb_blocksize;
 +              buf->dt_blocks =
 +                      XFS_FSB_TO_BB(mp, be64_to_cpu(dic->di_nblocks));
 +              break;
 +      }
 +
 +      memset(&buf->dt_pad1, 0, sizeof(buf->dt_pad1));
 +      memset(&buf->dt_pad2, 0, sizeof(buf->dt_pad2));
 +      memset(&buf->dt_pad3, 0, sizeof(buf->dt_pad3));
 +
 +      /* Finally fill in the DMAPI specific fields */
 +      buf->dt_pers = 0;
 +      buf->dt_change = 0;
 +      buf->dt_nevents = DM_EVENT_MAX;
 +      buf->dt_emask = be32_to_cpu(dic->di_dmevmask);
 +      buf->dt_dtime = be32_to_cpu(dic->di_ctime.t_sec);
 +      /* Set if one of READ, WRITE or TRUNCATE bits is set in emask */
 +      buf->dt_pmanreg = (DMEV_ISSET(DM_EVENT_READ, buf->dt_emask) ||
 +                      DMEV_ISSET(DM_EVENT_WRITE, buf->dt_emask) ||
 +                      DMEV_ISSET(DM_EVENT_TRUNCATE, buf->dt_emask)) ? 1 : 0;
 +}
 +
 +/*
  + * Pull out both ondisk and incore fields; the incore values take preference.
 + * The inode must be kept locked SHARED by the caller.
 + */
 +STATIC void
 +xfs_ip_to_stat(
 +      xfs_mount_t             *mp,
 +      xfs_ino_t               ino,
 +      xfs_inode_t             *ip,
 +      dm_stat_t               *buf)
 +{
 +      xfs_icdinode_t          *dic = &ip->i_d;
 +
 +      buf->dt_ino = ino;
 +      buf->dt_nlink = dic->di_nlink;
 +      /*buf->dt_xfs_projid = dic->di_projid;*/
 +      buf->dt_mode = dic->di_mode;
 +      buf->dt_uid = dic->di_uid;
 +      buf->dt_gid = dic->di_gid;
 +      buf->dt_size = XFS_ISIZE(ip);
 +      buf->dt_dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
 +      buf->dt_atime = VFS_I(ip)->i_atime.tv_sec;
 +      buf->dt_mtime = dic->di_mtime.t_sec;
 +      buf->dt_ctime = dic->di_ctime.t_sec;
 +      buf->dt_xfs_xflags = xfs_ip2dmflags(ip);
 +      buf->dt_xfs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
 +      buf->dt_xfs_extents = dic->di_nextents;
 +      buf->dt_xfs_aextents = dic->di_anextents;
 +      buf->dt_xfs_igen = dic->di_gen;
 +      buf->dt_xfs_dmstate = dic->di_dmstate;
 +
 +      switch (dic->di_format) {
 +      case XFS_DINODE_FMT_DEV:
 +              buf->dt_rdev = ip->i_df.if_u2.if_rdev;
 +              buf->dt_blksize = BLKDEV_IOSIZE;
 +              buf->dt_blocks = 0;
 +              break;
 +      case XFS_DINODE_FMT_LOCAL:
 +      case XFS_DINODE_FMT_UUID:
 +              buf->dt_rdev = 0;
 +              buf->dt_blksize = mp->m_sb.sb_blocksize;
 +              buf->dt_blocks = 0;
 +              break;
 +      case XFS_DINODE_FMT_EXTENTS:
 +      case XFS_DINODE_FMT_BTREE:
 +              buf->dt_rdev = 0;
 +              buf->dt_blksize = mp->m_sb.sb_blocksize;
 +              buf->dt_blocks = XFS_FSB_TO_BB(mp,
 +                              (dic->di_nblocks + ip->i_delayed_blks));
 +              break;
 +      }
 +
 +      memset(&buf->dt_pad1, 0, sizeof(buf->dt_pad1));
 +      memset(&buf->dt_pad2, 0, sizeof(buf->dt_pad2));
 +      memset(&buf->dt_pad3, 0, sizeof(buf->dt_pad3));
 +
 +      /* Finally fill in the DMAPI specific fields */
 +      buf->dt_pers = 0;
 +      buf->dt_change = 0;
 +      buf->dt_nevents = DM_EVENT_MAX;
 +      buf->dt_emask = dic->di_dmevmask;
 +      buf->dt_dtime = dic->di_ctime.t_sec;
 +      /* Set if one of READ, WRITE or TRUNCATE bits is set in emask */
 +      buf->dt_pmanreg = (DMEV_ISSET(DM_EVENT_READ, buf->dt_emask) ||
 +                      DMEV_ISSET(DM_EVENT_WRITE, buf->dt_emask) ||
 +                      DMEV_ISSET(DM_EVENT_TRUNCATE, buf->dt_emask)) ? 1 : 0;
 +}
 +
 +/*
 + * Take the handle and put it at the end of a dm_xstat buffer.
 + * dt_compname is unused in bulkstat - so we zero it out.
 + * Finally, update link in dm_xstat_t to point to next struct.
 + */
 +STATIC void
 +xfs_dm_handle_to_xstat(
 +      dm_xstat_t      *xbuf,
 +      size_t          xstat_sz,
 +      dm_handle_t     *handle,
 +      size_t          handle_sz)
 +{
 +      dm_stat_t       *sbuf = &xbuf->dx_statinfo;
 +
 +      memcpy(xbuf + 1, handle, handle_sz);
 +      sbuf->dt_handle.vd_offset = (ssize_t) sizeof(dm_xstat_t);
 +      sbuf->dt_handle.vd_length = (size_t) DM_HSIZE(*handle);
 +      memset(&sbuf->dt_compname, 0, sizeof(dm_vardata_t));
 +      sbuf->_link = xstat_sz;
 +}
 +
 +STATIC int
 +xfs_dm_bulkall_iget_one(
 +      xfs_mount_t     *mp,
 +      xfs_ino_t       ino,
 +      xfs_daddr_t     bno,
 +      int             *value_lenp,
 +      dm_xstat_t      *xbuf,
 +      u_int           *xstat_szp,
 +      char            *attr_name,
 +      caddr_t         attr_buf)
 +{
 +      xfs_inode_t     *ip;
 +      dm_handle_t     handle;
 +      u_int           xstat_sz = *xstat_szp;
 +      int             value_len = *value_lenp;
 +      int             error;
 +
 +      error = xfs_iget(mp, NULL, ino,
 +                       XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno);
 +      if (error)
 +              return error;
 +
 +      xfs_ip_to_stat(mp, ino, ip, &xbuf->dx_statinfo);
 +      dm_ip_to_handle(&ip->i_vnode, &handle);
 +      xfs_dm_handle_to_xstat(xbuf, xstat_sz, &handle, sizeof(handle));
 +
 +      /* Drop ILOCK_SHARED for call to xfs_attr_get */
 +      xfs_iunlock(ip, XFS_ILOCK_SHARED);
 +
 +      memset(&xbuf->dx_attrdata, 0, sizeof(dm_vardata_t));
 +      error = xfs_attr_get(ip, attr_name, attr_buf, &value_len, ATTR_ROOT);
 +      iput(&ip->i_vnode);
 +
 +      DM_EA_XLATE_ERR(error);
 +      if (error && (error != ENOATTR)) {
 +              if (error == E2BIG)
 +                      error = ENOMEM;
 +              return error;
 +      }
 +
 +      /* How much space was in the attr? */
 +      if (error != ENOATTR) {
 +              xbuf->dx_attrdata.vd_offset = xstat_sz;
 +              xbuf->dx_attrdata.vd_length = value_len;
 +              xstat_sz += (value_len+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
 +      }
 +      *xstat_szp = xbuf->dx_statinfo._link = xstat_sz;
 +      *value_lenp = value_len;
 +      return 0;
 +}
 +
 +
 +STATIC int
 +xfs_dm_inline_attr(
 +      xfs_mount_t     *mp,
 +      xfs_dinode_t    *dip,
 +      char            *attr_name,
 +      caddr_t         attr_buf,
 +      int             *value_lenp)
 +{
 +      if (dip->di_aformat == XFS_DINODE_FMT_LOCAL) {
 +              xfs_attr_shortform_t    *sf;
 +              xfs_attr_sf_entry_t     *sfe;
 +              unsigned int            namelen = strlen(attr_name);
 +              unsigned int            valuelen = *value_lenp;
 +              int                     i;
 +
 +              sf = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
 +              sfe = &sf->list[0];
 +              for (i = 0; i < sf->hdr.count;
 +                              sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
 +                      if (sfe->namelen != namelen)
 +                              continue;
 +                      if (!(sfe->flags & XFS_ATTR_ROOT))
 +                              continue;
 +                      if (memcmp(attr_name, sfe->nameval, namelen) != 0)
 +                              continue;
 +                      if (valuelen < sfe->valuelen)
 +                              return ERANGE;
 +                      valuelen = sfe->valuelen;
 +                      memcpy(attr_buf, &sfe->nameval[namelen], valuelen);
 +                      *value_lenp = valuelen;
 +                      return 0;
 +              }
 +      }
 +      *value_lenp = 0;
 +      return ENOATTR;
 +}
 +
 +STATIC void
 +dm_dip_to_handle(
 +      xfs_ino_t       ino,
 +      xfs_dinode_t    *dip,
 +      dm_fsid_t       *fsid,
 +      dm_handle_t     *handlep)
 +{
 +      dm_fid_t        fid;
 +      int             hsize;
 +
 +      fid.dm_fid_len = sizeof(struct dm_fid) - sizeof(fid.dm_fid_len);
 +      fid.dm_fid_pad = 0;
 +      fid.dm_fid_ino = ino;
 +      fid.dm_fid_gen = be32_to_cpu(dip->di_gen);
 +
 +      memcpy(&handlep->ha_fsid, fsid, sizeof(*fsid));
 +      memcpy(&handlep->ha_fid, &fid, fid.dm_fid_len + sizeof(fid.dm_fid_len));
 +      hsize = DM_HSIZE(*handlep);
 +      memset((char *)handlep + hsize, 0, sizeof(*handlep) - hsize);
 +}
 +
 +STATIC int
 +xfs_dm_bulkall_inline_one(
 +      xfs_mount_t     *mp,
 +      xfs_ino_t       ino,
 +      xfs_dinode_t    *dip,
 +      dm_fsid_t       *fsid,
 +      int             *value_lenp,
 +      dm_xstat_t      *xbuf,
 +      u_int           *xstat_szp,
 +      char            *attr_name,
 +      caddr_t         attr_buf)
 +{
 +      dm_handle_t     handle;
 +      u_int           xstat_sz = *xstat_szp;
 +      int             value_len = *value_lenp;
 +      int             error;
 +
 +      if (dip->di_mode == 0)
 +              return ENOENT;
 +
 +      xfs_dip_to_stat(mp, ino, dip, &xbuf->dx_statinfo);
 +      dm_dip_to_handle(ino, dip, fsid, &handle);
 +      xfs_dm_handle_to_xstat(xbuf, xstat_sz, &handle, sizeof(handle));
 +
 +      memset(&xbuf->dx_attrdata, 0, sizeof(dm_vardata_t));
 +      error = xfs_dm_inline_attr(mp, dip, attr_name, attr_buf, &value_len);
 +      DM_EA_XLATE_ERR(error);
 +      if (error && (error != ENOATTR)) {
 +              if (error == E2BIG)
 +                      error = ENOMEM;
 +              return error;
 +      }
 +
 +      /* How much space was in the attr? */
 +      if (error != ENOATTR) {
 +              xbuf->dx_attrdata.vd_offset = xstat_sz;
 +              xbuf->dx_attrdata.vd_length = value_len;
 +              xstat_sz += (value_len+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
 +      }
 +      *xstat_szp = xbuf->dx_statinfo._link = xstat_sz;
 +      *value_lenp = value_len;
 +      return 0;
 +}
 +
 +/*
 + * This is used by dm_get_bulkall().
  + * Given an inumber, it igets the inode and fills the given buffer
 + * with the dm_xstat structure for the file.
 + */
 +STATIC int
 +xfs_dm_bulkall_one(
 +      xfs_mount_t     *mp,            /* mount point for filesystem */
 +      xfs_ino_t       ino,            /* inode number to get data for */
 +      void            __user *buffer, /* buffer to place output in */
 +      int             ubsize,         /* size of buffer */
 +      void            *private_data,  /* my private data */
 +      xfs_daddr_t     bno,            /* starting block of inode cluster */
 +      int             *ubused,        /* amount of buffer we used */
 +      void            *dibuff,        /* on-disk inode buffer */
 +      int             *res)           /* bulkstat result code */
 +{
 +      dm_xstat_t      *xbuf;
 +      u_int           xstat_sz;
 +      int             error;
 +      int             value_len;
 +      int             kern_buf_sz;
 +      int             attr_buf_sz;
 +      caddr_t         attr_buf;
 +      void __user     *attr_user_buf;
 +      dm_bulkstat_one_t *dmb = (dm_bulkstat_one_t*)private_data;
 +
 +      /* Returns positive errors to XFS */
 +
 +      *res = BULKSTAT_RV_NOTHING;
 +
 +      if (!buffer || xfs_internal_inum(mp, ino))
 +              return EINVAL;
 +
 +      xstat_sz = DM_STAT_SIZE(*xbuf, 0);
 +      xstat_sz = (xstat_sz + (DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
 +      if (xstat_sz > ubsize)
 +              return ENOMEM;
 +
 +      kern_buf_sz = xstat_sz;
 +      xbuf = kmem_alloc(kern_buf_sz, KM_SLEEP);
 +
 +      /* Determine place to drop attr value, and available space. */
 +      value_len = ubsize - xstat_sz;
 +      if (value_len > ATTR_MAX_VALUELEN)
 +              value_len = ATTR_MAX_VALUELEN;
 +
 +      attr_user_buf = buffer + xstat_sz;
 +      attr_buf_sz = value_len;
 +      attr_buf = kmem_alloc(attr_buf_sz, KM_SLEEP);
 +
 +      if (!dibuff)
 +              error = xfs_dm_bulkall_iget_one(mp, ino, bno,
 +                                              &value_len, xbuf, &xstat_sz,
 +                                              dmb->attrname.dan_chars,
 +                                              attr_buf);
 +      else
 +              error = xfs_dm_bulkall_inline_one(mp, ino,
 +                                                (xfs_dinode_t *)dibuff,
 +                                                &dmb->fsid,
 +                                                &value_len, xbuf, &xstat_sz,
 +                                                dmb->attrname.dan_chars,
 +                                                attr_buf);
 +      if (error)
 +              goto out_free_buffers;
 +
 +      if (copy_to_user(buffer, xbuf, kern_buf_sz)) {
 +              error = EFAULT;
 +              goto out_free_buffers;
 +      }
 +      if (copy_to_user(attr_user_buf, attr_buf, value_len)) {
 +              error = EFAULT;
 +              goto out_free_buffers;
 +      }
 +
 +      kmem_free(attr_buf);
 +      kmem_free(xbuf);
 +
 +      *res = BULKSTAT_RV_DIDONE;
 +      if (ubused)
 +              *ubused = xstat_sz;
 +      dmb->laststruct = buffer;
 +      return 0;
 +
 + out_free_buffers:
 +      kmem_free(attr_buf);
 +      kmem_free(xbuf);
 +      return error;
 +}
 +
 +/*
 + * Take the handle and put it at the end of a dm_stat buffer.
 + * dt_compname is unused in bulkstat - so we zero it out.
 + * Finally, update link in dm_stat_t to point to next struct.
 + */
 +STATIC void
 +xfs_dm_handle_to_stat(
 +      dm_stat_t       *sbuf,
 +      size_t          stat_sz,
 +      dm_handle_t     *handle,
 +      size_t          handle_sz)
 +{
 +      memcpy(sbuf + 1, handle, handle_sz);
 +      sbuf->dt_handle.vd_offset = (ssize_t) sizeof(dm_stat_t);
 +      sbuf->dt_handle.vd_length = (size_t) DM_HSIZE(*handle);
 +      memset(&sbuf->dt_compname, 0, sizeof(dm_vardata_t));
 +      sbuf->_link = stat_sz;
 +}
 +
 +STATIC int
 +xfs_dm_bulkattr_iget_one(
 +      xfs_mount_t     *mp,
 +      xfs_ino_t       ino,
 +      xfs_daddr_t     bno,
 +      dm_stat_t       *sbuf,
 +      u_int           stat_sz)
 +{
 +      xfs_inode_t     *ip;
 +      dm_handle_t     handle;
 +      int             error;
 +
 +      error = xfs_iget(mp, NULL, ino,
 +                       XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno);
 +      if (error)
 +              return error;
 +
 +      xfs_ip_to_stat(mp, ino, ip, sbuf);
 +      dm_ip_to_handle(&ip->i_vnode, &handle);
 +      xfs_dm_handle_to_stat(sbuf, stat_sz, &handle, sizeof(handle));
 +
 +      xfs_iput(ip, XFS_ILOCK_SHARED);
 +      return 0;
 +}
 +
 +STATIC int
 +xfs_dm_bulkattr_inline_one(
 +      xfs_mount_t     *mp,
 +      xfs_ino_t       ino,
 +      xfs_dinode_t    *dip,
 +      dm_fsid_t       *fsid,
 +      dm_stat_t       *sbuf,
 +      u_int           stat_sz)
 +{
 +      dm_handle_t     handle;
 +
 +      if (dip->di_mode == 0)
 +              return ENOENT;
 +      xfs_dip_to_stat(mp, ino, dip, sbuf);
 +      dm_dip_to_handle(ino, dip, fsid, &handle);
 +      xfs_dm_handle_to_stat(sbuf, stat_sz, &handle, sizeof(handle));
 +      return 0;
 +}
 +
 +/*
 + * This is used by dm_get_bulkattr().
  + * Given an inumber, it igets the inode and fills the given buffer
 + * with the dm_stat structure for the file.
 + */
 +STATIC int
 +xfs_dm_bulkattr_one(
 +      xfs_mount_t     *mp,            /* mount point for filesystem */
 +      xfs_ino_t       ino,            /* inode number to get data for */
 +      void            __user *buffer, /* buffer to place output in */
 +      int             ubsize,         /* size of buffer */
 +      void            *private_data,  /* my private data */
 +      xfs_daddr_t     bno,            /* starting block of inode cluster */
 +      int             *ubused,        /* amount of buffer we used */
 +      void            *dibuff,        /* on-disk inode buffer */
 +      int             *res)           /* bulkstat result code */
 +{
 +      dm_stat_t       *sbuf;
 +      u_int           stat_sz;
 +      int             error;
 +      dm_bulkstat_one_t *dmb = (dm_bulkstat_one_t*)private_data;
 +
 +      /* Returns positive errors to XFS */
 +
 +      *res = BULKSTAT_RV_NOTHING;
 +
 +      if (!buffer || xfs_internal_inum(mp, ino))
 +              return EINVAL;
 +
 +      stat_sz = DM_STAT_SIZE(*sbuf, 0);
 +      stat_sz = (stat_sz+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
 +      if (stat_sz > ubsize)
 +              return ENOMEM;
 +
 +      sbuf = kmem_alloc(stat_sz, KM_SLEEP);
 +
 +      if (!dibuff)
 +              error = xfs_dm_bulkattr_iget_one(mp, ino, bno, sbuf, stat_sz);
 +      else
 +              error = xfs_dm_bulkattr_inline_one(mp, ino,
 +                                                 (xfs_dinode_t *)dibuff,
 +                                                 &dmb->fsid, sbuf, stat_sz);
 +      if (error)
 +              goto out_free_buffer;
 +
 +      if (copy_to_user(buffer, sbuf, stat_sz)) {
 +              error = EFAULT;
 +              goto out_free_buffer;
 +      }
 +
 +      kmem_free(sbuf);
 +      *res = BULKSTAT_RV_DIDONE;
 +      if (ubused)
 +              *ubused = stat_sz;
 +      dmb->laststruct = buffer;
 +      return 0;
 +
 + out_free_buffer:
 +      kmem_free(sbuf);
 +      return error;
 +}
 +
 +/* xfs_dm_f_get_eventlist - return the dm_eventset_t mask for inode ip. */
 +
 +STATIC int
 +xfs_dm_f_get_eventlist(
 +      xfs_inode_t     *ip,
 +      dm_right_t      right,
 +      u_int           nelem,
 +      dm_eventset_t   *eventsetp,             /* in kernel space! */
 +      u_int           *nelemp)                /* in kernel space! */
 +{
 +      dm_eventset_t   eventset;
 +
 +      if (right < DM_RIGHT_SHARED)
 +              return(EACCES);
 +
 +      /* Note that we MUST return a regular file's managed region bits as
 +         part of the mask because dm_get_eventlist is supposed to return the
 +         union of all managed region flags in those bits.  Since we only
 +         support one region, we can just return the bits as they are.  For
 +         all other object types, the bits will already be zero.  Handy, huh?
 +      */
 +
 +      eventset = ip->i_d.di_dmevmask;
 +
 +      /* Now copy the event mask and event count back to the caller.  We
 +         return the lesser of nelem and DM_EVENT_MAX.
 +      */
 +
 +      if (nelem > DM_EVENT_MAX)
 +              nelem = DM_EVENT_MAX;
 +      eventset &= (1 << nelem) - 1;
 +
 +      *eventsetp = eventset;
 +      *nelemp = nelem;
 +      return(0);
 +}
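
The clamp above is ordinary bit-mask arithmetic: with nelem slots, (1 << nelem) - 1 keeps only the low nelem event bits. A tiny worked example with illustrative values:

    #include <stdio.h>

    int main(void)
    {
            unsigned long eventset = 0xabUL;        /* some event bits          */
            unsigned int nelem = 3;                 /* caller asked for 3 slots */

            eventset &= (1UL << nelem) - 1;         /* keep bits 0..2 -> 0x3    */
            printf("0x%lx\n", eventset);
            return 0;
    }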
 +
 +
 +/* xfs_dm_f_set_eventlist - update the dm_eventset_t mask in the inode vp.  Only the
 +   bits from zero to maxevent-1 are being replaced; higher bits are preserved.
 +*/
 +
 +STATIC int
 +xfs_dm_f_set_eventlist(
 +      xfs_inode_t     *ip,
 +      dm_right_t      right,
 +      dm_eventset_t   *eventsetp,     /* in kernel space! */
 +      u_int           maxevent)
 +{
 +      dm_eventset_t   eventset;
 +      dm_eventset_t   max_mask;
 +      dm_eventset_t   valid_events;
 +      xfs_trans_t     *tp;
 +      xfs_mount_t     *mp;
 +      int             error;
 +
 +      if (right < DM_RIGHT_EXCL)
 +              return(EACCES);
 +
 +      eventset = *eventsetp;
 +      if (maxevent >= sizeof(ip->i_d.di_dmevmask) * NBBY)
 +              return(EINVAL);
 +      max_mask = (1 << maxevent) - 1;
 +
 +      if (S_ISDIR(ip->i_d.di_mode)) {
 +              valid_events = DM_XFS_VALID_DIRECTORY_EVENTS;
 +      } else {        /* file or symlink */
 +              valid_events = DM_XFS_VALID_FILE_EVENTS;
 +      }
 +      if ((eventset & max_mask) & ~valid_events)
 +              return(EINVAL);
 +
 +      /* Adjust the event mask so that the managed region bits will not
 +         be altered.
 +      */
 +
 +      max_mask &= ~(1 <<DM_EVENT_READ);       /* preserve current MR bits */
 +      max_mask &= ~(1 <<DM_EVENT_WRITE);
 +      max_mask &= ~(1 <<DM_EVENT_TRUNCATE);
 +
 +      mp = ip->i_mount;
 +      tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
 +      error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
 +      if (error) {
 +              xfs_trans_cancel(tp, 0);
 +              return(error);
 +      }
 +      xfs_ilock(ip, XFS_ILOCK_EXCL);
 +      xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 +
 +      ip->i_d.di_dmevmask = (eventset & max_mask) | (ip->i_d.di_dmevmask & ~max_mask);
 +
 +      xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 +      igrab(&ip->i_vnode);
 +      xfs_trans_commit(tp, 0);
 +
 +      return(0);
 +}
 +
 +
 +/* xfs_dm_fs_get_eventlist - return the dm_eventset_t mask for filesystem vfsp. */
 +
 +STATIC int
 +xfs_dm_fs_get_eventlist(
 +      xfs_mount_t     *mp,
 +      dm_right_t      right,
 +      u_int           nelem,
 +      dm_eventset_t   *eventsetp,             /* in kernel space! */
 +      u_int           *nelemp)                /* in kernel space! */
 +{
 +      dm_eventset_t   eventset;
 +
 +      if (right < DM_RIGHT_SHARED)
 +              return(EACCES);
 +
 +      eventset = mp->m_dmevmask;
 +
 +      /* Now copy the event mask and event count back to the caller.  We
 +         return the lesser of nelem and DM_EVENT_MAX.
 +      */
 +
 +      if (nelem > DM_EVENT_MAX)
 +              nelem = DM_EVENT_MAX;
 +      eventset &= (1 << nelem) - 1;
 +
 +      *eventsetp = eventset;
 +      *nelemp = nelem;
 +      return(0);
 +}
 +
 +
 +/* xfs_dm_fs_set_eventlist - update the dm_eventset_t mask in the mount structure for
 +   filesystem vfsp.  Only the bits from zero to maxevent-1 are being replaced;
 +   higher bits are preserved.
 +*/
 +
 +STATIC int
 +xfs_dm_fs_set_eventlist(
 +      xfs_mount_t     *mp,
 +      dm_right_t      right,
 +      dm_eventset_t   *eventsetp,     /* in kernel space! */
 +      u_int           maxevent)
 +{
 +      dm_eventset_t   eventset;
 +      dm_eventset_t   max_mask;
 +
 +      if (right < DM_RIGHT_EXCL)
 +              return(EACCES);
 +
 +      eventset = *eventsetp;
 +
 +      if (maxevent >= sizeof(mp->m_dmevmask) * NBBY)
 +              return(EINVAL);
 +      max_mask = (1 << maxevent) - 1;
 +
 +      if ((eventset & max_mask) & ~DM_XFS_VALID_FS_EVENTS)
 +              return(EINVAL);
 +
 +      mp->m_dmevmask = (eventset & max_mask) | (mp->m_dmevmask & ~max_mask);
 +      return(0);
 +}
 +
 +
 +/* Code in this routine must exactly match the logic in xfs_diordwr() in
 +   order for this to work!
 +*/
 +
 +STATIC int
 +xfs_dm_direct_ok(
 +      xfs_inode_t     *ip,
 +      dm_off_t        off,
 +      dm_size_t       len,
 +      void            __user *bufp)
 +{
 +      xfs_mount_t     *mp;
 +
 +      mp = ip->i_mount;
 +
 +      /* Realtime files can ONLY do direct I/O. */
 +
 +      if (XFS_IS_REALTIME_INODE(ip))
 +              return(1);
 +
 +      /* If direct I/O is disabled, or if the request is too small, use
 +         buffered I/O.
 +      */
 +
 +      if (!dm_min_dio_xfer || len < dm_min_dio_xfer)
 +              return(0);
 +
 +#if 0
 +      /* If the request is not well-formed or is too large, use
 +         buffered I/O.
 +      */
 +
 +      if ((__psint_t)bufp & scache_linemask)  /* if buffer not aligned */
 +              return(0);
 +      if (off & mp->m_blockmask)              /* if file offset not aligned */
 +              return(0);
 +      if (len & mp->m_blockmask)              /* if xfer length not aligned */
 +              return(0);
 +      if (len > ctooff(v.v_maxdmasz - 1))     /* if transfer too large */
 +              return(0);
 +
 +      /* A valid direct I/O candidate. */
 +
 +      return(1);
 +#else
 +      return(0);
 +#endif
 +}
 +
 +
 +/* We need to be able to select various combinations of O_NONBLOCK,
 +   O_DIRECT, and O_SYNC, yet we don't have a file descriptor and we don't
 +   have the file's pathname.  All we have is a handle.
 +*/
 +
 +STATIC int
 +xfs_dm_rdwr(
 +      struct inode    *inode,
 +      uint            fflag,
 +      mode_t          fmode,
 +      dm_off_t        off,
 +      dm_size_t       len,
 +      void            __user *bufp,
 +      int             *rvp)
 +{
 +      const struct cred *cred = current_cred();
 +      xfs_inode_t     *ip = XFS_I(inode);
 +      int             error;
 +      int             oflags;
 +      ssize_t         xfer;
 +      struct file     *file;
 +      struct dentry   *dentry;
 +
 +      if ((off < 0) || (off > i_size_read(inode)) || !S_ISREG(inode->i_mode))
 +              return EINVAL;
 +
 +      if (fmode & FMODE_READ) {
 +              oflags = O_RDONLY;
 +      } else {
 +              oflags = O_WRONLY;
 +      }
 +
 +      /*
 +       * Build file descriptor flags and I/O flags.  O_NONBLOCK is needed so
 +       * that we don't block on mandatory file locks.  This is invisible I/O,
 +       * so don't change the atime.
 +       */
 +
 +      oflags |= O_LARGEFILE | O_NONBLOCK | O_NOATIME;
 +      if (xfs_dm_direct_ok(ip, off, len, bufp))
 +              oflags |= O_DIRECT;
 +
 +      if (fflag & O_SYNC)
 +              oflags |= O_SYNC;
 +
 +      if (inode->i_fop == NULL) {
 +              /* no iput; caller did get, and will do put */
 +              return EINVAL;
 +      }
 +
 +      igrab(inode);
 +
 +      dentry = d_obtain_alias(inode);
 +      if (dentry == NULL) {
 +              iput(inode);
 +              return ENOMEM;
 +      }
 +
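 +      /* On success the inode, dentry and vfsmount references taken above
 +         all end up owned by the new struct file and are dropped by the
 +         final fput() below.
 +      */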
 +      file = dentry_open(dentry, mntget(ip->i_mount->m_vfsmount), oflags,
 +                         cred);
 +      if (IS_ERR(file)) {
 +              return -PTR_ERR(file);
 +      }
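 +      /* Invisible I/O must not update the file's change/modify times. */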
 +      file->f_mode |= FMODE_NOCMTIME;
 +
 +      if (fmode & FMODE_READ) {
 +              xfer = file->f_op->read(file, bufp, len, (loff_t*)&off);
 +      } else {
 +              xfer = file->f_op->write(file, bufp, len, (loff_t*)&off);
 +      }
 +
 +      if (xfer >= 0) {
 +              *rvp = xfer;
 +              error = 0;
 +      } else {
 +              /* xfs_read/xfs_write return negative error--flip it */
 +              error = -(int)xfer;
 +      }
 +
 +      fput(file);
 +      return error;
 +}
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_clear_inherit(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      dm_attrname_t   __user *attrnamep)
 +{
 +      return(-ENOSYS); /* Return negative error to DMAPI */
 +}
 +
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_create_by_handle(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      void            __user *hanp,
 +      size_t          hlen,
 +      char            __user *cname)
 +{
 +      return(-ENOSYS); /* Return negative error to DMAPI */
 +}
 +
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_downgrade_right(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           type)           /* DM_FSYS_OBJ or zero */
 +{
 +#ifdef        DEBUG_RIGHTS
 +      char            buffer[sizeof(dm_handle_t) * 2 + 1];
 +
 +      if (!xfs_vp_to_hexhandle(inode, type, buffer)) {
 +              printf("dm_downgrade_right: old %d new %d type %d handle %s\n",
 +                      right, DM_RIGHT_SHARED, type, buffer);
 +      } else {
 +              printf("dm_downgrade_right: old %d new %d type %d handle "
 +                      "<INVALID>\n", right, DM_RIGHT_SHARED, type);
 +      }
 +#endif        /* DEBUG_RIGHTS */
 +      return(0);
 +}
 +
 +
 +/* Note: xfs_dm_get_allocinfo() makes no attempt to coalesce two adjacent
 +   extents when both are of type DM_EXTENT_RES; this is left to the caller.
 +   XFS guarantees that there will never be two adjacent DM_EXTENT_HOLE extents.
 +
 +   In order to provide the caller with all extents in a file including
 +   those beyond the file's last byte offset, we have to use the xfs_bmapi()
 +   interface.
 +*/
 +
 +STATIC int
 +xfs_dm_get_allocinfo_rvp(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      dm_off_t        __user  *offp,
 +      u_int           nelem,
 +      dm_extent_t     __user *extentp,
 +      u_int           __user *nelemp,
 +      int             *rvp)
 +{
 +      xfs_inode_t     *ip = XFS_I(inode);
 +      xfs_mount_t     *mp;            /* file system mount point */
 +      xfs_fileoff_t   fsb_offset;
 +      xfs_filblks_t   fsb_length;
 +      dm_off_t        startoff;
 +      int             elem;
 +      xfs_bmbt_irec_t *bmp = NULL;
 +      u_int           bmpcnt = 50;
 +      u_int           bmpsz = sizeof(xfs_bmbt_irec_t) * bmpcnt;
 +      int             error = 0;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_SHARED)
 +              return(-EACCES);
 +
 +      if ((inode->i_mode & S_IFMT) != S_IFREG)
 +              return(-EINVAL);
 +
 +      if (copy_from_user( &startoff, offp, sizeof(startoff)))
 +              return(-EFAULT);
 +
 +      mp = ip->i_mount;
 +      ASSERT(mp);
 +
 +      if (startoff > XFS_MAXIOFFSET(mp))
 +              return(-EINVAL);
 +
 +      if (nelem == 0)
 +              return(-EINVAL);
 +
 +      /* Convert the caller's starting offset into filesystem allocation
 +         units as required by xfs_bmapi().  Round the offset down so that
 +         it is sure to be included in the reply.
 +      */
 +
 +      fsb_offset = XFS_B_TO_FSBT(mp, startoff);
 +      fsb_length = XFS_B_TO_FSB(mp, XFS_MAXIOFFSET(mp)) - fsb_offset;
 +      elem = 0;
 +
 +      if (fsb_length)
 +              bmp = kmem_alloc(bmpsz, KM_SLEEP);
 +
 +      while (fsb_length && elem < nelem) {
 +              dm_extent_t     extent;
 +              xfs_filblks_t   fsb_bias;
 +              dm_size_t       bias;
 +              int             lock;
 +              int             num;
 +              int             i;
 +
 +              /* Compute how many getbmap structures to use on the xfs_bmapi
 +                 call.
 +              */
 +
 +              num = MIN((u_int)(nelem - elem), bmpcnt);
 +
 +              xfs_ilock(ip, XFS_IOLOCK_SHARED);
 +              lock = xfs_ilock_map_shared(ip);
 +
 +              error = xfs_bmapi(NULL, ip, fsb_offset, fsb_length,
 +                      XFS_BMAPI_ENTIRE, NULL, 0, bmp, &num, NULL, NULL);
 +
 +              xfs_iunlock_map_shared(ip, lock);
 +              xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 +
 +              if (error) {
 +                      error = -error; /* Return negative error to DMAPI */
 +                      goto finish_out;
 +              }
 +
 +              /* Fill in the caller's extents, adjusting the bias in the
 +                 first entry if necessary.
 +              */
 +
 +              for (i = 0; i < num; i++, extentp++) {
 +                      bias = startoff - XFS_FSB_TO_B(mp, bmp[i].br_startoff);
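 +                      /* bias is non-zero only for the first extent returned,
 +                         when the caller's startoff falls inside it; later
 +                         iterations advance startoff to an extent boundary,
 +                         so the bias becomes zero.
 +                      */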
 +                      extent.ex_offset = startoff;
 +                      extent.ex_length =
 +                              XFS_FSB_TO_B(mp, bmp[i].br_blockcount) - bias;
 +                      if (bmp[i].br_startblock == HOLESTARTBLOCK) {
 +                              extent.ex_type = DM_EXTENT_HOLE;
 +                      } else {
 +                              extent.ex_type = DM_EXTENT_RES;
 +                      }
 +                      startoff = extent.ex_offset + extent.ex_length;
 +
 +                      if (copy_to_user( extentp, &extent, sizeof(extent))) {
 +                              error = -EFAULT;
 +                              goto finish_out;
 +                      }
 +
 +                      fsb_bias = fsb_offset - bmp[i].br_startoff;
 +                      fsb_offset += bmp[i].br_blockcount - fsb_bias;
 +                      fsb_length -= bmp[i].br_blockcount - fsb_bias;
 +                      elem++;
 +              }
 +      }
 +
 +      if (fsb_length == 0) {
 +              startoff = 0;
 +      }
 +      if (copy_to_user( offp, &startoff, sizeof(startoff))) {
 +              error = -EFAULT;
 +              goto finish_out;
 +      }
 +
 +      if (copy_to_user( nelemp, &elem, sizeof(elem))) {
 +              error = -EFAULT;
 +              goto finish_out;
 +      }
 +
 +      *rvp = (fsb_length == 0 ? 0 : 1);
 +
 +finish_out:
 +      if (bmp)
 +              kmem_free(bmp);
 +      return(error);
 +}
 +
 +
 +STATIC int
 +xfs_dm_zero_xstatinfo_link(
 +      dm_xstat_t __user       *dxs)
 +{
 +      dm_xstat_t              *ldxs;
 +      int                     error = 0;
 +
 +      if (!dxs)
 +              return 0;
 +      ldxs = kmalloc(sizeof(*ldxs), GFP_KERNEL);
 +      if (!ldxs)
 +              return -ENOMEM;
 +      if (copy_from_user(ldxs, dxs, sizeof(*dxs))) {
 +              error = -EFAULT;
 +      } else {
 +              ldxs->dx_statinfo._link = 0;
 +              if (copy_to_user(dxs, ldxs, sizeof(*dxs)))
 +                      error = -EFAULT;
 +      }
 +      kfree(ldxs);
 +      return error;
 +}
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_get_bulkall_rvp(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           mask,
 +      dm_attrname_t   __user *attrnamep,
 +      dm_attrloc_t    __user *locp,
 +      size_t          buflen,
 +      void            __user *bufp,   /* address of buffer in user space */
 +      size_t          __user *rlenp,  /* user space address */
 +      int             *rvalp)
 +{
 +      int             error, done;
 +      int             nelems;
 +      u_int           statstruct_sz;
 +      dm_attrloc_t    loc;
 +      xfs_mount_t     *mp = XFS_I(inode)->i_mount;
 +      dm_attrname_t   attrname;
 +      dm_bulkstat_one_t dmb;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (copy_from_user(&attrname, attrnamep, sizeof(attrname)) ||
 +          copy_from_user(&loc, locp, sizeof(loc)))
 +              return -EFAULT;
 +
 +      if (attrname.an_chars[0] == '\0')
 +              return(-EINVAL);
 +
 +      if (right < DM_RIGHT_SHARED)
 +              return(-EACCES);
 +
 +      /* Because we will write directly to the user's buffer, make sure that
 +         the buffer is properly aligned.
 +      */
 +
 +      if (((unsigned long)bufp & (DM_STAT_ALIGN - 1)) != 0)
 +              return(-EFAULT);
 +
 +      /* Size of the handle is constant for this function.
 +       * If there are no files with attributes, then this will be the
 +       * maximum number of inodes we can get.
 +       */
 +
 +      statstruct_sz = DM_STAT_SIZE(dm_xstat_t, 0);
 +      statstruct_sz = (statstruct_sz+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
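 +      /* statstruct_sz is rounded up to DM_STAT_ALIGN so that each packed
 +         dm_xstat_t record in the user's buffer stays properly aligned.
 +      */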
 +
 +      nelems = buflen / statstruct_sz;
 +      if (nelems < 1) {
 +              if (put_user( statstruct_sz, rlenp ))
 +                      return(-EFAULT);
 +              return(-E2BIG);
 +      }
 +
 +      /* Build the on-disk version of the attribute name. */
 +      strcpy(dmb.attrname.dan_chars, dmattr_prefix);
 +      strncpy(&dmb.attrname.dan_chars[DMATTR_PREFIXLEN],
 +              attrname.an_chars, DM_ATTR_NAME_SIZE + 1);
 +      dmb.attrname.dan_chars[sizeof(dmb.attrname.dan_chars) - 1] = '\0';
 +
 +      /*
 +       * fill the buffer with dm_xstat_t's
 +       */
 +
 +      dmb.laststruct = NULL;
 +      memcpy(&dmb.fsid, mp->m_fixedfsid, sizeof(dm_fsid_t));
 +      error = xfs_bulkstat(mp, (xfs_ino_t *)&loc, &nelems,
 +                           xfs_dm_bulkall_one, (void*)&dmb, statstruct_sz,
 +                           bufp, BULKSTAT_FG_INLINE, &done);
 +      if (error)
 +              return(-error); /* Return negative error to DMAPI */
 +
 +      *rvalp = !done ? 1 : 0;
 +
 +      if (put_user( statstruct_sz * nelems, rlenp ))
 +              return(-EFAULT);
 +
 +      if (copy_to_user( locp, &loc, sizeof(loc)))
 +              return(-EFAULT);
 +      /*
 +       *  If we didn't do any, we must not have any more to do.
 +       */
 +      if (nelems < 1)
 +              return(0);
 +      /*
 +       * Set _link in the last struct to zero
 +       */
 +      return xfs_dm_zero_xstatinfo_link((dm_xstat_t __user *)dmb.laststruct);
 +}
 +
 +
 +STATIC int
 +xfs_dm_zero_statinfo_link(
 +      dm_stat_t __user        *dxs)
 +{
 +      dm_stat_t               *ldxs;
 +      int                     error = 0;
 +
 +      if (!dxs)
 +              return 0;
 +      ldxs = kmalloc(sizeof(*ldxs), GFP_KERNEL);
 +      if (!ldxs)
 +              return -ENOMEM;
 +      if (copy_from_user(ldxs, dxs, sizeof(*dxs))) {
 +              error = -EFAULT;
 +      } else {
 +              ldxs->_link = 0;
 +              if (copy_to_user(dxs, ldxs, sizeof(*dxs)))
 +                      error = -EFAULT;
 +      }
 +      kfree(ldxs);
 +      return error;
 +}
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_get_bulkattr_rvp(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           mask,
 +      dm_attrloc_t    __user *locp,
 +      size_t          buflen,
 +      void            __user *bufp,
 +      size_t          __user *rlenp,
 +      int             *rvalp)
 +{
 +      int             error, done;
 +      int             nelems;
 +      u_int           statstruct_sz;
 +      dm_attrloc_t    loc;
 +      xfs_mount_t     *mp = XFS_I(inode)->i_mount;
 +      dm_bulkstat_one_t dmb;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_SHARED)
 +              return(-EACCES);
 +
 +      if (copy_from_user( &loc, locp, sizeof(loc)))
 +              return(-EFAULT);
 +
 +      /* Because we will write directly to the user's buffer, make sure that
 +         the buffer is properly aligned.
 +      */
 +
 +      if (((unsigned long)bufp & (DM_STAT_ALIGN - 1)) != 0)
 +              return(-EFAULT);
 +
 +      /* size of the handle is constant for this function */
 +
 +      statstruct_sz = DM_STAT_SIZE(dm_stat_t, 0);
 +      statstruct_sz = (statstruct_sz+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
 +
 +      nelems = buflen / statstruct_sz;
 +      if (nelems < 1) {
 +              if (put_user( statstruct_sz, rlenp ))
 +                      return(-EFAULT);
 +              return(-E2BIG);
 +      }
 +
 +      dmb.laststruct = NULL;
 +      memcpy(&dmb.fsid, mp->m_fixedfsid, sizeof(dm_fsid_t));
 +      error = xfs_bulkstat(mp, (xfs_ino_t *)&loc, &nelems,
 +                              xfs_dm_bulkattr_one, (void*)&dmb,
 +                              statstruct_sz, bufp, BULKSTAT_FG_INLINE, &done);
 +      if (error)
 +              return(-error); /* Return negative error to DMAPI */
 +
 +      *rvalp = !done ? 1 : 0;
 +
 +      if (put_user( statstruct_sz * nelems, rlenp ))
 +              return(-EFAULT);
 +
 +      if (copy_to_user( locp, &loc, sizeof(loc)))
 +              return(-EFAULT);
 +
 +      /*
 +       *  If we didn't do any, we must not have any more to do.
 +       */
 +      if (nelems < 1)
 +              return(0);
 +      /*
 +       * Set _link in the last struct to zero
 +       */
 +      return xfs_dm_zero_statinfo_link((dm_stat_t __user *)dmb.laststruct);
 +}
 +
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_get_config(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      dm_config_t     flagname,
 +      dm_size_t       __user *retvalp)
 +{
 +      dm_size_t       retval;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      switch (flagname) {
 +      case DM_CONFIG_DTIME_OVERLOAD:
 +      case DM_CONFIG_PERS_ATTRIBUTES:
 +      case DM_CONFIG_PERS_EVENTS:
 +      case DM_CONFIG_PERS_MANAGED_REGIONS:
 +      case DM_CONFIG_PUNCH_HOLE:
 +      case DM_CONFIG_WILL_RETRY:
 +              retval = DM_TRUE;
 +              break;
 +
 +      case DM_CONFIG_CREATE_BY_HANDLE:        /* these will never be done */
 +      case DM_CONFIG_LOCK_UPGRADE:
 +      case DM_CONFIG_PERS_INHERIT_ATTRIBS:
 +              retval = DM_FALSE;
 +              break;
 +
 +      case DM_CONFIG_BULKALL:
 +              retval = DM_TRUE;
 +              break;
 +      case DM_CONFIG_MAX_ATTR_ON_DESTROY:
 +              retval = DM_MAX_ATTR_BYTES_ON_DESTROY;
 +              break;
 +
 +      case DM_CONFIG_MAX_ATTRIBUTE_SIZE:
 +              retval = ATTR_MAX_VALUELEN;
 +              break;
 +
 +      case DM_CONFIG_MAX_HANDLE_SIZE:
 +              retval = DM_MAX_HANDLE_SIZE;
 +              break;
 +
 +      case DM_CONFIG_MAX_MANAGED_REGIONS:
 +              retval = 1;
 +              break;
 +
 +      case DM_CONFIG_TOTAL_ATTRIBUTE_SPACE:
 +              retval = 0x7fffffff;    /* actually it's unlimited */
 +              break;
 +
 +      default:
 +              return(-EINVAL);
 +      }
 +
 +      /* Copy the results back to the user. */
 +
 +      if (copy_to_user( retvalp, &retval, sizeof(retval)))
 +              return(-EFAULT);
 +      return(0);
 +}
 +
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_get_config_events(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           nelem,
 +      dm_eventset_t   __user *eventsetp,
 +      u_int           __user *nelemp)
 +{
 +      dm_eventset_t   eventset;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (nelem == 0)
 +              return(-EINVAL);
 +
 +      eventset = DM_XFS_SUPPORTED_EVENTS;
 +
 +      /* Now copy the event mask and event count back to the caller.  We
 +         return the lesser of nelem and DM_EVENT_MAX.
 +      */
 +
 +      if (nelem > DM_EVENT_MAX)
 +              nelem = DM_EVENT_MAX;
 +      eventset &= (1 << nelem) - 1;
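 +      /* Report only the low nelem event bits; *nelemp tells the caller how
 +         many events the returned mask actually covers.
 +      */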
 +
 +      if (copy_to_user( eventsetp, &eventset, sizeof(eventset)))
 +              return(-EFAULT);
 +
 +      if (put_user(nelem, nelemp))
 +              return(-EFAULT);
 +      return(0);
 +}
 +
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_get_destroy_dmattr(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      dm_attrname_t  *attrnamep,
 +      char            **valuepp,
 +      int             *vlenp)
 +{
 +      dm_dkattrname_t dkattrname;
 +      int             alloc_size;
 +      int             value_len;
 +      char            *value;
 +      int             error;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      *vlenp = -1;            /* assume failure by default */
 +
 +      if (attrnamep->an_chars[0] == '\0')
 +              return(-EINVAL);
 +
 +      /* Build the on-disk version of the attribute name. */
 +
 +      strcpy(dkattrname.dan_chars, dmattr_prefix);
 +      strncpy(&dkattrname.dan_chars[DMATTR_PREFIXLEN],
 +              (char *)attrnamep->an_chars, DM_ATTR_NAME_SIZE + 1);
 +      dkattrname.dan_chars[sizeof(dkattrname.dan_chars) - 1] = '\0';
 +
 +      /* xfs_attr_get will not return anything if the buffer is too small,
 +         and we don't know how big to make the buffer, so this may take
 +         two tries to get it right.  The initial try must use a buffer of
 +         at least XFS_BUG_KLUDGE bytes to prevent buffer overflow because
 +         of a bug in XFS.
 +      */
 +
 +      alloc_size = XFS_BUG_KLUDGE;
 +      value = kmalloc(alloc_size, GFP_KERNEL);
 +      if (value == NULL)
 +              return(-ENOMEM);
 +
 +      error = xfs_attr_get(XFS_I(inode), dkattrname.dan_chars, value,
 +                                                      &value_len, ATTR_ROOT);
 +      if (error == ERANGE) {
 +              kfree(value);
 +              alloc_size = value_len;
 +              value = kmalloc(alloc_size, GFP_KERNEL);
 +              if (value == NULL)
 +                      return(-ENOMEM);
 +
 +              error = xfs_attr_get(XFS_I(inode), dkattrname.dan_chars, value,
 +                                      &value_len, ATTR_ROOT);
 +      }
 +      if (error) {
 +              kfree(value);
 +              DM_EA_XLATE_ERR(error);
 +              return(-error); /* Return negative error to DMAPI */
 +      }
 +
 +      /* The attribute exists and has a value.  Note that a value_len of
 +         zero is valid!
 +      */
 +
 +      if (value_len == 0) {
 +              kfree(value);
 +              *vlenp = 0;
 +              return(0);
 +      } else if (value_len > DM_MAX_ATTR_BYTES_ON_DESTROY) {
 +              char    *value2;
 +
 +              value2 = kmalloc(DM_MAX_ATTR_BYTES_ON_DESTROY, GFP_KERNEL);
 +              if (value2 == NULL) {
 +                      kfree(value);
 +                      return(-ENOMEM);
 +              }
 +              memcpy(value2, value, DM_MAX_ATTR_BYTES_ON_DESTROY);
 +              kfree(value);
 +              value = value2;
 +              value_len = DM_MAX_ATTR_BYTES_ON_DESTROY;
 +      }
 +      *vlenp = value_len;
 +      *valuepp = value;
 +      return(0);
 +}
 +
 +/* This code was taken from xfs_fcntl(F_DIOINFO) and modified slightly because
 +   we don't have a flags parameter (no open file).
 +   Taken from xfs_ioctl(XFS_IOC_DIOINFO) on Linux.
 +*/
 +
 +STATIC int
 +xfs_dm_get_dioinfo(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      dm_dioinfo_t    __user *diop)
 +{
 +      dm_dioinfo_t    dio;
 +      xfs_mount_t     *mp;
 +      xfs_inode_t     *ip = XFS_I(inode);
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_SHARED)
 +              return(-EACCES);
 +
 +      mp = ip->i_mount;
 +
 +      dio.d_miniosz = dio.d_mem = MIN_DIO_SIZE(mp);
 +      dio.d_maxiosz = MAX_DIO_SIZE(mp);
 +      dio.d_dio_only = DM_FALSE;
 +
 +      if (copy_to_user(diop, &dio, sizeof(dio)))
 +              return(-EFAULT);
 +      return(0);
 +}
 +
 +typedef struct dm_readdir_cb {
 +      xfs_mount_t             *mp;
 +      char __user             *ubuf;
 +      dm_stat_t __user        *lastbuf;
 +      size_t                  spaceleft;
 +      size_t                  nwritten;
 +      int                     error;
 +      dm_stat_t               kstat;
 +} dm_readdir_cb_t;
 +
 +STATIC int
 +dm_filldir(void *__buf, const char *name, int namelen, loff_t offset,
 +              u64 ino, unsigned int d_type)
 +{
 +      dm_readdir_cb_t *cb = __buf;
 +      dm_stat_t       *statp = &cb->kstat;
 +      size_t          len;
 +      int             error;
 +      int             needed;
 +
 +      /*
 +       * Make sure we have enough space.
 +       */
 +      needed = dm_stat_size(namelen + 1);
 +      if (cb->spaceleft < needed) {
 +              cb->spaceleft = 0;
 +              return -ENOSPC;
 +      }
 +
 +      error = -EINVAL;
 +      if (xfs_internal_inum(cb->mp, ino))
 +              goto out_err;
 +
 +      memset(statp, 0, dm_stat_size(MAXNAMLEN));
 +      error = -xfs_dm_bulkattr_iget_one(cb->mp, ino, 0,
 +                      statp, needed);
 +      if (error)
 +              goto out_err;
 +
 +      /*
 +       * On return from bulkstat_one(), statp->_link points
 +       * at the end of the handle in the stat structure.
 +       */
 +      statp->dt_compname.vd_offset = statp->_link;
 +      statp->dt_compname.vd_length = namelen + 1;
 +
 +      len = statp->_link;
 +
 +      /* Word-align the record */
 +      statp->_link = dm_stat_align(len + namelen + 1);
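 +      /* _link now holds the aligned offset from this record to the next
 +         one; the caller zeroes it in the final record to terminate the
 +         list.
 +      */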
 +
 +      error = -EFAULT;
 +      if (copy_to_user(cb->ubuf, statp, len))
 +              goto out_err;
 +      if (copy_to_user(cb->ubuf + len, name, namelen))
 +              goto out_err;
 +      if (put_user(0, cb->ubuf + len + namelen))
 +              goto out_err;
 +
 +      cb->lastbuf = (dm_stat_t __user *)cb->ubuf;
 +      cb->spaceleft -= statp->_link;
 +      cb->nwritten += statp->_link;
 +      cb->ubuf += statp->_link;
 +
 +      return 0;
 +
 + out_err:
 +      cb->error = error;
 +      return error;
 +}
 +
 +/* Returns negative errors to DMAPI */
 +STATIC int
 +xfs_dm_get_dirattrs_rvp(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           mask,
 +      dm_attrloc_t    __user *locp,
 +      size_t          buflen,
 +      void            __user *bufp,
 +      size_t          __user *rlenp,
 +      int             *rvp)
 +{
 +      xfs_inode_t     *dp = XFS_I(inode);
 +      xfs_mount_t     *mp = dp->i_mount;
 +      dm_readdir_cb_t *cb;
 +      dm_attrloc_t    loc;
 +      int             error;
 +
 +      if (right < DM_RIGHT_SHARED)
 +              return -EACCES;
 +
 +      /*
 +       * Make sure that the buffer is properly aligned.
 +       */
 +      if (((unsigned long)bufp & (DM_STAT_ALIGN - 1)) != 0)
 +              return -EFAULT;
 +
 +      if (mask & ~(DM_AT_HANDLE|DM_AT_EMASK|DM_AT_PMANR|DM_AT_PATTR|
 +                   DM_AT_DTIME|DM_AT_CFLAG|DM_AT_STAT))
 +              return -EINVAL;
 +
 +      if (!S_ISDIR(inode->i_mode))
 +              return -EINVAL;
 +
 +      /*
 +       * bufp should be able to fit at least one dm_stat entry including
 +       * dt_handle and a full-size MAXNAMLEN dt_compname.
 +       */
 +      if (buflen < dm_stat_size(MAXNAMLEN))
 +              return -ENOMEM;
 +
 +      if (copy_from_user(&loc, locp, sizeof(loc)))
 +              return -EFAULT;
 +
 +      cb = kzalloc(sizeof(*cb) + dm_stat_size(MAXNAMLEN), GFP_KERNEL);
 +      if (!cb)
 +              return -ENOMEM;
 +
 +      cb->mp = mp;
 +      cb->spaceleft = buflen;
 +      cb->ubuf = bufp;
 +
 +      mutex_lock(&inode->i_mutex);
 +      error = -ENOENT;
 +      if (!IS_DEADDIR(inode)) {
 +              error = -xfs_readdir(dp, cb, dp->i_size,
 +                                       (xfs_off_t *)&loc, dm_filldir);
 +      }
 +      mutex_unlock(&inode->i_mutex);
 +
 +      if (error)
 +              goto out_kfree;
 +      if (cb->error) {
 +              error = cb->error;
 +              goto out_kfree;
 +      }
 +
 +      error = -EFAULT;
 +      if (cb->lastbuf && put_user(0, &cb->lastbuf->_link))
 +              goto out_kfree;
 +      if (put_user(cb->nwritten, rlenp))
 +              goto out_kfree;
 +      if (copy_to_user(locp, &loc, sizeof(loc)))
 +              goto out_kfree;
 +
 +      if (cb->nwritten)
 +              *rvp = 1;
 +      else
 +              *rvp = 0;
 +      error = 0;
 +
 + out_kfree:
 +      kfree(cb);
 +      return error;
 +}
 +
 +STATIC int
 +xfs_dm_get_dmattr(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      dm_attrname_t   __user *attrnamep,
 +      size_t          buflen,
 +      void            __user *bufp,
 +      size_t          __user  *rlenp)
 +{
 +      dm_dkattrname_t name;
 +      char            *value;
 +      int             value_len;
 +      int             alloc_size;
 +      int             error;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_SHARED)
 +              return(-EACCES);
 +
 +      if ((error = xfs_copyin_attrname(attrnamep, &name)) != 0)
 +              return(-error); /* Return negative error to DMAPI */
 +
 +      /* Allocate a buffer to receive the attribute's value.  We allocate
 +         at least one byte even if the caller specified a buflen of zero.
 +         (A buflen of zero is considered valid.)
 +
 +         Allocating a minimum of XFS_BUG_KLUDGE bytes temporarily works
 +         around a bug within XFS in which in-inode attribute values are not
 +         checked to see if they will fit in the buffer before they are
 +         copied.  Since no in-core attribute value can be larger than 256
 +         bytes (an 8-bit size field), we allocate that minimum size here to
 +         prevent buffer overrun in both the kernel's and user's buffers.
 +      */
 +
 +      alloc_size = buflen;
 +      if (alloc_size < XFS_BUG_KLUDGE)
 +              alloc_size = XFS_BUG_KLUDGE;
 +      if (alloc_size > ATTR_MAX_VALUELEN)
 +              alloc_size = ATTR_MAX_VALUELEN;
-       value = kmem_alloc(alloc_size, KM_SLEEP | KM_LARGE);
++      value = kmem_zalloc_large(alloc_size);
 +
 +      /* Get the attribute's value. */
 +
 +      value_len = alloc_size;         /* in/out parameter */
 +
 +      error = xfs_attr_get(XFS_I(inode), name.dan_chars, value, &value_len,
 +                                      ATTR_ROOT);
 +      DM_EA_XLATE_ERR(error);
 +
 +      /* DMAPI requires an errno of ENOENT if an attribute does not exist,
 +         so remap ENOATTR here.
 +      */
 +
 +      if (error == ENOATTR)
 +              error = ENOENT;
 +      if (!error && value_len > buflen)
 +              error = E2BIG;
 +      if (!error && copy_to_user(bufp, value, value_len))
 +              error = EFAULT;
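 +      /* Even when the value does not fit (E2BIG), report its real length so
 +         the caller can retry with a large enough buffer.
 +      */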
 +      if (!error || error == E2BIG) {
 +              if (put_user(value_len, rlenp))
 +                      error = EFAULT;
 +      }
 +
 +      kmem_free(value);
 +      return(-error); /* Return negative error to DMAPI */
 +}
 +
 +STATIC int
 +xfs_dm_get_eventlist(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           type,
 +      u_int           nelem,
 +      dm_eventset_t   *eventsetp,
 +      u_int           *nelemp)
 +{
 +      int             error;
 +      xfs_inode_t     *ip = XFS_I(inode);
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (type == DM_FSYS_OBJ) {
 +              error = xfs_dm_fs_get_eventlist(ip->i_mount, right, nelem,
 +                      eventsetp, nelemp);
 +      } else {
 +              error = xfs_dm_f_get_eventlist(ip, right, nelem,
 +                      eventsetp, nelemp);
 +      }
 +      return(-error); /* Returns negative error to DMAPI */
 +}
 +
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_get_fileattr(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           mask,           /* not used; always return everything */
 +      dm_stat_t       __user *statp)
 +{
 +      dm_stat_t       stat;
 +      xfs_inode_t     *ip = XFS_I(inode);
 +      xfs_mount_t     *mp;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_SHARED)
 +              return(-EACCES);
 +
 +      /* Find the mount point. */
 +
 +      mp = ip->i_mount;
 +
 +      xfs_ilock(ip, XFS_ILOCK_SHARED);
 +      xfs_ip_to_stat(mp, ip->i_ino, ip, &stat);
 +      xfs_iunlock(ip, XFS_ILOCK_SHARED);
 +
 +      if (copy_to_user( statp, &stat, sizeof(stat)))
 +              return(-EFAULT);
 +      return(0);
 +}
 +
 +
 +/* We currently only support a maximum of one managed region per file, and
 +   use the DM_EVENT_READ, DM_EVENT_WRITE, and DM_EVENT_TRUNCATE events in
 +   the file's dm_eventset_t event mask to implement the DM_REGION_READ,
 +   DM_REGION_WRITE, and DM_REGION_TRUNCATE flags for that single region.
 +*/
 +
 +STATIC int
 +xfs_dm_get_region(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           nelem,
 +      dm_region_t     __user *regbufp,
 +      u_int           __user *nelemp)
 +{
 +      dm_eventset_t   evmask;
 +      dm_region_t     region;
 +      xfs_inode_t     *ip = XFS_I(inode);
 +      u_int           elem;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_SHARED)
 +              return(-EACCES);
 +
 +      evmask = ip->i_d.di_dmevmask;   /* read the mask "atomically" */
 +
 +      /* Get the file's current managed region flags out of the
 +         dm_eventset_t mask and use them to build a managed region that
 +         covers the entire file, i.e. set rg_offset and rg_size to zero.
 +      */
 +
 +      memset((char *)&region, 0, sizeof(region));
 +
 +      if (evmask & (1 << DM_EVENT_READ))
 +              region.rg_flags |= DM_REGION_READ;
 +      if (evmask & (1 << DM_EVENT_WRITE))
 +              region.rg_flags |= DM_REGION_WRITE;
 +      if (evmask & (1 << DM_EVENT_TRUNCATE))
 +              region.rg_flags |= DM_REGION_TRUNCATE;
 +
 +      elem = (region.rg_flags ? 1 : 0);
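 +      /* At most one managed region is ever reported, and it always covers
 +         the whole file (rg_offset and rg_size stay zero).
 +      */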
 +
 +      if (copy_to_user( nelemp, &elem, sizeof(elem)))
 +              return(-EFAULT);
 +      if (elem > nelem)
 +              return(-E2BIG);
 +      if (elem && copy_to_user(regbufp, &region, sizeof(region)))
 +              return(-EFAULT);
 +      return(0);
 +}
 +
 +
 +STATIC int
 +xfs_dm_getall_dmattr(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      size_t          buflen,
 +      void            __user *bufp,
 +      size_t          __user *rlenp)
 +{
 +      attrlist_cursor_kern_t cursor;
 +      attrlist_t      *attrlist;
 +      dm_attrlist_t   __user *ulist;
 +      int             *last_link;
 +      int             alignment;
 +      int             total_size;
 +      int             list_size = 8192;       /* should be big enough */
 +      int             error;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_SHARED)
 +              return(-EACCES);
 +
 +      /* Verify that the user gave us a buffer that is 4-byte aligned and
 +         writable, and work directly within that buffer.  As a side effect,
 +         values of buflen < sizeof(int) are rounded down to zero.
 +      */
 +
 +      alignment = sizeof(int) - 1;
 +      if ((((__psint_t)bufp & alignment) != 0) ||
 +               !access_ok(VERIFY_WRITE, bufp, buflen)) {
 +              return(-EFAULT);
 +      }
 +      buflen &= ~alignment;           /* round down the alignment */
 +
 +      /* Initialize all the structures and variables for the main loop. */
 +
 +      memset(&cursor, 0, sizeof(cursor));
 +      attrlist = (attrlist_t *)kmem_alloc(list_size, KM_SLEEP);
 +      total_size = 0;
 +      ulist = (dm_attrlist_t *)bufp;
 +      last_link = NULL;
 +
 +      /* Use vop_attr_list to get the names of DMAPI attributes, and use
 +         vop_attr_get to get their values.  There is a risk here that the
 +         DMAPI attributes could change between the vop_attr_list and
 +         vop_attr_get calls.  If we can detect it, we return EIO to notify
 +         the user.
 +      */
 +
 +      do {
 +              int     i;
 +
 +              /* Get a buffer full of attribute names.  If there aren't any
 +                 more or if we encounter an error, then finish up.
 +              */
 +
 +              error = xfs_attr_list(XFS_I(inode), (char *)attrlist, list_size,
 +                                              ATTR_ROOT, &cursor);
 +              DM_EA_XLATE_ERR(error);
 +
 +              if (error || attrlist->al_count == 0)
 +                      break;
 +
 +              for (i = 0; i < attrlist->al_count; i++) {
 +                      attrlist_ent_t  *entry;
 +                      char            *user_name;
 +                      int             size_needed;
 +                      int             value_len;
 +
 +                      /* Skip over all non-DMAPI attributes.  If the
 +                         attribute name is too long, we assume it is
 +                         non-DMAPI even if it starts with the correct
 +                         prefix.
 +                      */
 +
 +                      entry = ATTR_ENTRY(attrlist, i);
 +                      if (strncmp(entry->a_name, dmattr_prefix, DMATTR_PREFIXLEN))
 +                              continue;
 +                      user_name = &entry->a_name[DMATTR_PREFIXLEN];
 +                      if (strlen(user_name) > DM_ATTR_NAME_SIZE)
 +                              continue;
 +
 +                      /* We have a valid DMAPI attribute to return.  If it
 +                         won't fit in the user's buffer, we still need to
 +                         keep track of the number of bytes for the user's
 +                         next call.
 +                      */
 +
 +
 +                      size_needed = sizeof(*ulist) + entry->a_valuelen;
 +                      size_needed = (size_needed + alignment) & ~alignment;
 +
 +                      total_size += size_needed;
 +                      if (total_size > buflen)
 +                              continue;
 +
 +                      /* Start by filling in all the fields in the
 +                         dm_attrlist_t structure.
 +                      */
 +
 +                      strncpy((char *)ulist->al_name.an_chars, user_name,
 +                              DM_ATTR_NAME_SIZE);
 +                      ulist->al_data.vd_offset = sizeof(*ulist);
 +                      ulist->al_data.vd_length = entry->a_valuelen;
 +                      ulist->_link = size_needed;
 +                      last_link = &ulist->_link;
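 +                      /* Remember this record's _link so it can be zeroed
 +                         after the loop, terminating the returned list.
 +                      */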
 +
 +                      /* Next read the attribute's value into its correct
 +                         location after the dm_attrlist structure.  Any sort
 +                         of error indicates that the data is moving under us,
 +                         so we return EIO to let the user know.
 +                      */
 +
 +                      value_len = entry->a_valuelen;
 +
 +                      error = xfs_attr_get(XFS_I(inode), entry->a_name,
 +                                              (void *)(ulist + 1), &value_len,
 +                                              ATTR_ROOT);
 +                      DM_EA_XLATE_ERR(error);
 +
 +                      if (error || value_len != entry->a_valuelen) {
 +                              error = EIO;
 +                              break;
 +                      }
 +
 +                      ulist = (dm_attrlist_t *)((char *)ulist + ulist->_link);
 +              }
 +      } while (!error && attrlist->al_more);
 +      if (last_link)
 +              *last_link = 0;
 +
 +      if (!error && total_size > buflen)
 +              error = E2BIG;
 +      if (!error || error == E2BIG) {
 +              if (put_user(total_size, rlenp))
 +                      error = EFAULT;
 +      }
 +
 +      kmem_free(attrlist);
 +      return(-error); /* Return negative error to DMAPI */
 +}
 +
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_getall_inherit(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           nelem,
 +      dm_inherit_t    __user *inheritbufp,
 +      u_int           __user *nelemp)
 +{
 +      return(-ENOSYS); /* Return negative error to DMAPI */
 +}
 +
 +
 +/* Initialize location pointer for subsequent dm_get_dirattrs,
 +   dm_get_bulkattr, and dm_get_bulkall calls.  The same initialization must
 +   work for inode-based routines (dm_get_dirattrs) and filesystem-based
 +   routines (dm_get_bulkattr and dm_get_bulkall).  Filesystem-based functions
 +   call this routine using the filesystem's root inode.
 +*/
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_init_attrloc(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      dm_attrloc_t    __user *locp)
 +{
 +      dm_attrloc_t    loc = 0;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_SHARED)
 +              return(-EACCES);
 +
 +      if (copy_to_user( locp, &loc, sizeof(loc)))
 +              return(-EFAULT);
 +      return(0);
 +}
 +
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_mkdir_by_handle(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      void            __user *hanp,
 +      size_t          hlen,
 +      char            __user *cname)
 +{
 +      return(-ENOSYS); /* Return negative error to DMAPI */
 +}
 +
 +
 +/*
 + * Probe and Punch
 + *
 + * Hole punching alignment is based on the underlying device's base
 + * allocation size.  Because the alignment is not defined in the DMAPI
 + * spec, we are free to choose it here: round inwards (offset up and
 + * length down) to the block, extent or page size, whichever is bigger.
 + * Our DMAPI implementation rounds the hole geometry strictly inwards;
 + * if that is not possible, both xfs_dm_probe_hole and xfs_dm_punch_hole
 + * return EINVAL, which differs from the DMAPI spec.  Note that
 + * length = 0 is special - it means "punch to EOF", and in that case we
 + * treat the punch as removing everything past offset (including
 + * preallocation past EOF).
 + */
 +
 +STATIC int
 +xfs_dm_round_hole(
 +      dm_off_t        offset,
 +      dm_size_t       length,
 +      dm_size_t       align,
 +      xfs_fsize_t     filesize,
 +      dm_off_t        *roff,
 +      dm_size_t       *rlen)
 +{
 +
 +      dm_off_t        off = offset;
 +      dm_size_t       len = length;
 +
 +      /* Try to round offset up to the nearest boundary */
 +      *roff = roundup_64(off, align);
 +      if ((*roff >= filesize) || (len && (len < align)))
 +              return -EINVAL;
 +
 +      if ((len == 0) || ((off + len) == filesize)) {
 +              /* punch to EOF */
 +              *rlen = 0;
 +      } else {
 +              /* Round length down to the nearest boundary. */
 +              ASSERT(len >= align);
 +              ASSERT(align > (*roff - off));
 +              len -= *roff - off;
 +              *rlen = len - do_mod(len, align);
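 +              /* Illustrative numbers: with align = 4096, offset = 100 and
 +                 length = 10000, *roff becomes 4096, len becomes
 +                 10000 - 3996 = 6004, and *rlen rounds down to 4096.
 +              */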
 +              if (*rlen == 0)
 +                      return -EINVAL; /* requested length is too small */
 +      }
 +#ifdef CONFIG_DMAPI_DEBUG
 +      printk("xfs_dm_round_hole: off %lu, len %ld, align %lu, "
 +             "filesize %llu, roff %ld, rlen %ld\n",
 +             offset, length, align, filesize, *roff, *rlen);
 +#endif
 +      return 0; /* hole geometry successfully rounded */
 +}
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_probe_hole(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      dm_off_t        off,
 +      dm_size_t       len,
 +      dm_off_t        __user  *roffp,
 +      dm_size_t       __user *rlenp)
 +{
 +      dm_off_t        roff;
 +      dm_size_t       rlen;
 +      xfs_inode_t     *ip = XFS_I(inode);
 +      xfs_mount_t     *mp;
 +      uint            lock_flags;
 +      xfs_fsize_t     realsize;
 +      dm_size_t       align;
 +      int             error;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_SHARED)
 +              return -EACCES;
 +
 +      if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
 +              return -EINVAL;
 +
 +      mp = ip->i_mount;
 +      lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
 +      xfs_ilock(ip, lock_flags);
 +      realsize = ip->i_size;
 +      xfs_iunlock(ip, lock_flags);
 +
 +      if ((off + len) > realsize)
 +              return -E2BIG;
 +
 +      align = 1 << mp->m_sb.sb_blocklog;
 +
 +      error = xfs_dm_round_hole(off, len, align, realsize, &roff, &rlen);
 +      if (error)
 +              return error;
 +
 +      if (copy_to_user( roffp, &roff, sizeof(roff)))
 +              return -EFAULT;
 +      if (copy_to_user( rlenp, &rlen, sizeof(rlen)))
 +              return -EFAULT;
 +      return(0);
 +}
 +
 +
 +STATIC int
 +xfs_dm_punch_hole(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      dm_off_t        off,
 +      dm_size_t       len)
 +{
 +      xfs_flock64_t   bf;
 +      int             error = 0;
 +      xfs_inode_t     *ip = XFS_I(inode);
 +      xfs_mount_t     *mp;
 +      dm_size_t       align;
 +      xfs_fsize_t     realsize;
 +      dm_off_t        roff;
 +      dm_size_t       rlen;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_EXCL)
 +              return -EACCES;
 +
 +      /* Make sure there are no leases. */
 +      error = break_lease(inode, FMODE_WRITE);
 +      if (error)
 +              return -EBUSY;
 +
 +      error = get_write_access(inode);
 +      if (error)
 +              return -EBUSY;
 +
 +      mp = ip->i_mount;
 +
 +      down_rw_sems(inode, DM_SEM_FLAG_WR);
 +
 +      xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 +      realsize = ip->i_size;
 +      xfs_iunlock(ip, XFS_ILOCK_EXCL);
 +      align = xfs_get_extsz_hint(ip);
 +      if (align == 0)
 +              align = 1;
 +
 +      align <<= mp->m_sb.sb_blocklog;
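 +      /* align is now the extent size hint (or a single block) in bytes,
 +         i.e. the granularity the punched hole must be rounded to.
 +      */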
 +
 +      if ((off + len) > realsize) {
 +              xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 +              error = -E2BIG;
 +              goto up_and_out;
 +      }
 +
 +      if ((off + len) == realsize)
 +              len = 0;
 +
 +      error = xfs_dm_round_hole(off, len, align, realsize, &roff, &rlen);
 +      if (error || (off != roff) || (len != rlen)) {
 +              xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 +              error = -EINVAL;
 +              goto up_and_out;
 +      }
 +
 +      bf.l_type = 0;
 +      bf.l_whence = 0;
 +      bf.l_start = (xfs_off_t)off;
 +      if (len) {
 +              bf.l_len = len;
 +      } else {
 +              /*
 +               * When we are punching to EOF, we have to make sure we punch
 +               * the last partial block that contains EOF. Round up
 +               * the length to make sure we punch the block and not just
 +               * zero it.
 +               */
 +              bf.l_len = roundup_64((realsize - off), mp->m_sb.sb_blocksize);
 +      }
 +
 +#ifdef CONFIG_DMAPI_DEBUG
 +      printk("xfs_dm_punch_hole: off %lu, len %ld, align %lu\n",
 +              off, len, align);
 +#endif
 +
 +      error = xfs_change_file_space(ip, XFS_IOC_UNRESVSP, &bf,
 +                              (xfs_off_t)off, XFS_ATTR_DMI|XFS_ATTR_NOLOCK);
 +
 +      /*
 +       * if punching to end of file, kill any blocks past EOF that
 +       * may have been (speculatively) preallocated. No point in
 +       * leaving them around if we are migrating the file....
 +       */
 +      if (!error && (len == 0)) {
 +              error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_HASLOCK);
 +      }
 +
 +      /*
 +       * negate the error for return here as core XFS functions return
 +       * positive error numbers
 +       */
 +      if (error)
 +              error = -error;
 +
 +      /* Let threads in send_data_event know we punched the file. */
 +      ip->i_d.di_dmstate++;
 +      xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 +
 +up_and_out:
 +      up_rw_sems(inode, DM_SEM_FLAG_WR);
 +      put_write_access(inode);
 +
 +      return error;
 +}
 +
 +
 +STATIC int
 +xfs_dm_read_invis_rvp(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      dm_off_t        off,
 +      dm_size_t       len,
 +      void            __user *bufp,
 +      int             *rvp)
 +{
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_SHARED)
 +              return(-EACCES);
 +
 +      return(-xfs_dm_rdwr(inode, 0, FMODE_READ, off, len, bufp, rvp));
 +}
 +
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_release_right(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           type)           /* DM_FSYS_OBJ or zero */
 +{
 +#ifdef        DEBUG_RIGHTS
 +      char            buffer[sizeof(dm_handle_t) * 2 + 1];
 +
 +      if (!xfs_vp_to_hexhandle(inode, type, buffer)) {
 +              printf("dm_release_right: old %d type %d handle %s\n",
 +                      right, type, buffer);
 +      } else {
 +              printf("dm_release_right: old %d type %d handle "
 +                      " <INVALID>\n", right, type);
 +      }
 +#endif        /* DEBUG_RIGHTS */
 +      return(0);
 +}
 +
 +
 +STATIC int
 +xfs_dm_remove_dmattr(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      int             setdtime,
 +      dm_attrname_t   __user *attrnamep)
 +{
 +      dm_dkattrname_t name;
 +      int             error;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_EXCL)
 +              return(-EACCES);
 +
 +      if ((error = xfs_copyin_attrname(attrnamep, &name)) != 0)
 +              return(-error); /* Return negative error to DMAPI */
 +
 +      /* Remove the attribute from the object. */
 +
 +      error = xfs_attr_remove(XFS_I(inode), name.dan_chars, setdtime ?
 +                              ATTR_ROOT : (ATTR_ROOT|ATTR_KERNOTIME));
 +      DM_EA_XLATE_ERR(error);
 +
 +      if (error == ENOATTR)
 +              error = ENOENT;
 +      return(-error); /* Return negative error to DMAPI */
 +}
 +
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_request_right(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           type,           /* DM_FSYS_OBJ or zero */
 +      u_int           flags,
 +      dm_right_t      newright)
 +{
 +#ifdef        DEBUG_RIGHTS
 +      char            buffer[sizeof(dm_handle_t) * 2 + 1];
 +
 +      if (!xfs_vp_to_hexhandle(inode, type, buffer)) {
 +              printf("dm_request_right: old %d new %d type %d flags 0x%x "
 +                      "handle %s\n", right, newright, type, flags, buffer);
 +      } else {
 +              printf("dm_request_right: old %d new %d type %d flags 0x%x "
 +                      "handle <INVALID>\n", right, newright, type, flags);
 +      }
 +#endif        /* DEBUG_RIGHTS */
 +      return(0);
 +}
 +
 +
 +STATIC int
 +xfs_dm_set_dmattr(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      dm_attrname_t   __user *attrnamep,
 +      int             setdtime,
 +      size_t          buflen,
 +      void            __user *bufp)
 +{
 +      dm_dkattrname_t name;
 +      char            *value;
 +      int             alloc_size;
 +      int             error;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_EXCL)
 +              return(-EACCES);
 +
 +      if ((error = xfs_copyin_attrname(attrnamep, &name)) != 0)
 +              return(-error); /* Return negative error to DMAPI */
 +      if (buflen > ATTR_MAX_VALUELEN)
 +              return(-E2BIG);
 +
 +      /* Copy in the attribute's value and store the <name,value> pair in
 +         the object.  We allocate a buffer of at least one byte even if the
 +         caller specified a buflen of zero.  (A buflen of zero is considered
 +         valid.)
 +      */
 +
 +      alloc_size = (buflen == 0) ? 1 : buflen;
 +      value = kmem_alloc(alloc_size, KM_SLEEP);
 +      if (copy_from_user( value, bufp, buflen)) {
 +              error = EFAULT;
 +      } else {
 +              error = xfs_attr_set(XFS_I(inode), name.dan_chars, value, buflen,
 +                                      setdtime ? ATTR_ROOT :
 +                                      (ATTR_ROOT|ATTR_KERNOTIME));
 +              DM_EA_XLATE_ERR(error);
 +      }
 +      kmem_free(value);
 +      return(-error); /* Return negative error to DMAPI */
 +}
 +
 +STATIC int
 +xfs_dm_set_eventlist(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           type,
 +      dm_eventset_t   *eventsetp,     /* in kernel space! */
 +      u_int           maxevent)
 +{
 +      int             error;
 +      xfs_inode_t     *ip = XFS_I(inode);
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (type == DM_FSYS_OBJ) {
 +              error = xfs_dm_fs_set_eventlist(ip->i_mount, right, eventsetp, maxevent);
 +      } else {
 +              error = xfs_dm_f_set_eventlist(ip, right, eventsetp, maxevent);
 +      }
 +      return(-error); /* Return negative error to DMAPI */
 +}
 +
 +
 +/*
 + *  This turned out not to be XFS-specific, but leave it here with get_fileattr.
 + */
 +
 +STATIC int
 +xfs_dm_set_fileattr(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           mask,
 +      dm_fileattr_t   __user *statp)
 +{
 +      dm_fileattr_t   stat;
 +      struct iattr    iattr;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_EXCL)
 +              return(-EACCES);
 +
 +      if (copy_from_user( &stat, statp, sizeof(stat)))
 +              return(-EFAULT);
 +
 +      iattr.ia_valid = 0;
 +
 +      if (mask & DM_AT_MODE) {
 +              iattr.ia_valid |= ATTR_MODE;
 +              iattr.ia_mode = stat.fa_mode;
 +      }
 +      if (mask & DM_AT_UID) {
 +              iattr.ia_valid |= ATTR_UID;
 +              iattr.ia_uid = stat.fa_uid;
 +      }
 +      if (mask & DM_AT_GID) {
 +              iattr.ia_valid |= ATTR_GID;
 +              iattr.ia_gid = stat.fa_gid;
 +      }
 +      if (mask & DM_AT_ATIME) {
 +              iattr.ia_valid |= ATTR_ATIME;
 +              iattr.ia_atime.tv_sec = stat.fa_atime;
 +              iattr.ia_atime.tv_nsec = 0;
 +              inode->i_atime.tv_sec = stat.fa_atime;
 +      }
 +      if (mask & DM_AT_MTIME) {
 +              iattr.ia_valid |= ATTR_MTIME;
 +              iattr.ia_mtime.tv_sec = stat.fa_mtime;
 +              iattr.ia_mtime.tv_nsec = 0;
 +      }
 +      if (mask & DM_AT_CTIME) {
 +              iattr.ia_valid |= ATTR_CTIME;
 +              iattr.ia_ctime.tv_sec = stat.fa_ctime;
 +              iattr.ia_ctime.tv_nsec = 0;
 +      }
 +
 +      /*
 +       * DM_AT_DTIME only takes effect if DM_AT_CTIME is not specified.  We
 +       * overload ctime to also act as dtime, i.e. DM_CONFIG_DTIME_OVERLOAD.
 +       */
 +      if ((mask & DM_AT_DTIME) && !(mask & DM_AT_CTIME)) {
 +              iattr.ia_valid |= ATTR_CTIME;
 +              iattr.ia_ctime.tv_sec = stat.fa_dtime;
 +              iattr.ia_ctime.tv_nsec = 0;
 +      }
 +      if (mask & DM_AT_SIZE) {
 +              iattr.ia_valid |= ATTR_SIZE;
 +              iattr.ia_size = stat.fa_size;
 +      }
 +
 +      return -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_DMI);
 +}
 +
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_set_inherit(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      dm_attrname_t   __user *attrnamep,
 +      mode_t          mode)
 +{
 +      return(-ENOSYS); /* Return negative error to DMAPI */
 +}
 +
 +
 +STATIC int
 +xfs_dm_set_region(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           nelem,
 +      dm_region_t     __user *regbufp,
 +      dm_boolean_t    __user *exactflagp)
 +{
 +      xfs_inode_t     *ip = XFS_I(inode);
 +      xfs_trans_t     *tp;
 +      xfs_mount_t     *mp;
 +      dm_region_t     region;
 +      dm_eventset_t   new_mask;
 +      dm_eventset_t   mr_mask;
 +      int             error;
 +      u_int           exactflag;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_EXCL)
 +              return(-EACCES);
 +
 +      /* If the caller gave us more than one dm_region_t structure, complain.
 +         (He has to call dm_get_config() to find out what our limit is.)
 +      */
 +
 +      if (nelem > 1)
 +              return(-E2BIG);
 +
 +      /* If the user provided a dm_region_t structure, then copy it in,
 +         validate it, and convert its flags to the corresponding bits in a
 +         dm_set_eventlist() event mask.  A call with zero regions is
 +         equivalent to clearing all region flags.
 +      */
 +
 +      new_mask = 0;
 +      if (nelem == 1) {
 +              if (copy_from_user( &region, regbufp, sizeof(region)))
 +                      return(-EFAULT);
 +
 +              if (region.rg_flags & ~(DM_REGION_READ|DM_REGION_WRITE|DM_REGION_TRUNCATE))
 +                      return(-EINVAL);
 +              if (region.rg_flags & DM_REGION_READ)
 +                      new_mask |= 1 << DM_EVENT_READ;
 +              if (region.rg_flags & DM_REGION_WRITE)
 +                      new_mask |= 1 << DM_EVENT_WRITE;
 +              if (region.rg_flags & DM_REGION_TRUNCATE)
 +                      new_mask |= 1 << DM_EVENT_TRUNCATE;
 +      }
 +      mr_mask = (1 << DM_EVENT_READ) | (1 << DM_EVENT_WRITE) | (1 << DM_EVENT_TRUNCATE);
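 +      /* mr_mask covers exactly the three managed-region events that this
 +         call is allowed to change; all other event bits are preserved.
 +      */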
 +
 +      /* Get the file's existing event mask, clear the old managed region
 +         bits, add in the new ones, and update the file's mask.
 +      */
 +
 +      if (new_mask & prohibited_mr_events(inode->i_mapping)) {
 +              /* If the change is simply to remove the READ
 +               * bit, then that's always okay.  Otherwise, it's busy.
 +               */
 +              dm_eventset_t m1;
 +              m1 = ip->i_d.di_dmevmask & ((1 << DM_EVENT_WRITE) | (1 << DM_EVENT_TRUNCATE));
 +              if (m1 != new_mask) {
 +                      return -EBUSY;
 +              }
 +      }
 +
 +      mp = ip->i_mount;
 +      tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
 +      error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
 +      if (error) {
 +              xfs_trans_cancel(tp, 0);
 +              return(-error); /* Return negative error to DMAPI */
 +      }
 +      xfs_ilock(ip, XFS_ILOCK_EXCL);
 +      xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 +
 +      ip->i_d.di_dmevmask = (ip->i_d.di_dmevmask & ~mr_mask) | new_mask;
 +
 +      xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 +      igrab(inode);
 +      xfs_trans_commit(tp, 0);
 +
 +      /* Return the proper value for *exactflagp depending upon whether or not
 +         we "changed" the user's managed region.  In other words, if the user
 +         specified a non-zero value for either rg_offset or rg_size, we
 +         round each of those values back to zero.
 +      */
 +
 +      if (nelem && (region.rg_offset || region.rg_size)) {
 +              exactflag = DM_FALSE;   /* user region was changed */
 +      } else {
 +              exactflag = DM_TRUE;    /* user region was unchanged */
 +      }
 +      if (copy_to_user( exactflagp, &exactflag, sizeof(exactflag)))
 +              return(-EFAULT);
 +      return(0);
 +}
 +
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_symlink_by_handle(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      void __user     *hanp,
 +      size_t          hlen,
 +      char            __user *cname,
 +      char            __user *path)
 +{
 +      return(-ENOSYS); /* Return negative errors to DMAPI */
 +}
 +
 +
 +/*
 + * xfs_dm_sync_by_handle needs to do the same thing as sys_fsync()
 + */
 +STATIC int
 +xfs_dm_sync_by_handle(
 +      struct inode    *inode,
 +      dm_right_t      right)
 +{
 +      int             err, ret;
 +      xfs_inode_t     *ip = XFS_I(inode);
 +
 +      /* Returns negative errors to DMAPI */
 +      if (right < DM_RIGHT_EXCL)
 +              return(-EACCES);
 +
 +      /* We need to protect against concurrent writers. */
 +      ret = filemap_fdatawrite(inode->i_mapping);
 +      down_rw_sems(inode, DM_FLAGS_IMUX);
-       err = -xfs_fsync(ip);
++      err = xfs_fsync(inode, 1);
 +      if (!ret)
 +              ret = err;
 +      up_rw_sems(inode, DM_FLAGS_IMUX);
 +      err = filemap_fdatawait(inode->i_mapping);
 +      if (!ret)
 +              ret = err;
 +      xfs_iflags_clear(ip, XFS_ITRUNCATED);
 +      return ret;
 +}
 +
 +
 +/* ARGSUSED */
 +STATIC int
 +xfs_dm_upgrade_right(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      u_int           type)           /* DM_FSYS_OBJ or zero */
 +{
 +#ifdef        DEBUG_RIGHTS
 +      char            buffer[sizeof(dm_handle_t) * 2 + 1];
 +
 +      if (!xfs_vp_to_hexhandle(inode, type, buffer)) {
 +              printf("dm_upgrade_right: old %d new %d type %d handle %s\n",
 +                      right, DM_RIGHT_EXCL, type, buffer);
 +      } else {
 +              printf("dm_upgrade_right: old %d new %d type %d handle "
 +                      "<INVALID>\n", right, DM_RIGHT_EXCL, type);
 +      }
 +#endif        /* DEBUG_RIGHTS */
 +      return(0);
 +}
 +
 +
 +STATIC int
 +xfs_dm_write_invis_rvp(
 +      struct inode    *inode,
 +      dm_right_t      right,
 +      int             flags,
 +      dm_off_t        off,
 +      dm_size_t       len,
 +      void __user     *bufp,
 +      int             *rvp)
 +{
 +      int             fflag = 0;
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (right < DM_RIGHT_EXCL)
 +              return(-EACCES);
 +
 +      if (flags & DM_WRITE_SYNC)
 +              fflag |= O_SYNC;
 +      return(-xfs_dm_rdwr(inode, fflag, FMODE_WRITE, off, len, bufp, rvp));
 +}
 +
 +
 +STATIC void
 +xfs_dm_obj_ref_hold(
 +      struct inode    *inode)
 +{
 +      igrab(inode);
 +}
 +
 +
 +static fsys_function_vector_t xfs_fsys_vector[DM_FSYS_MAX];
 +
 +
 +STATIC int
 +xfs_dm_get_dmapiops(
 +      struct super_block      *sb,
 +      void                    *addr)
 +{
 +      static  int             initialized = 0;
 +      dm_fcntl_vector_t       *vecrq;
 +      fsys_function_vector_t  *vecp;
 +      int                     i = 0;
 +
 +      vecrq = (dm_fcntl_vector_t *)addr;
 +      vecrq->count =
 +              sizeof(xfs_fsys_vector) / sizeof(xfs_fsys_vector[0]);
 +      vecrq->vecp = xfs_fsys_vector;
 +      if (initialized)
 +              return(0);
 +      vecrq->code_level = DM_CLVL_XOPEN;
 +      vecp = xfs_fsys_vector;
 +
 +      vecp[i].func_no = DM_FSYS_CLEAR_INHERIT;
 +      vecp[i++].u_fc.clear_inherit = xfs_dm_clear_inherit;
 +      vecp[i].func_no = DM_FSYS_CREATE_BY_HANDLE;
 +      vecp[i++].u_fc.create_by_handle = xfs_dm_create_by_handle;
 +      vecp[i].func_no = DM_FSYS_DOWNGRADE_RIGHT;
 +      vecp[i++].u_fc.downgrade_right = xfs_dm_downgrade_right;
 +      vecp[i].func_no = DM_FSYS_GET_ALLOCINFO_RVP;
 +      vecp[i++].u_fc.get_allocinfo_rvp = xfs_dm_get_allocinfo_rvp;
 +      vecp[i].func_no = DM_FSYS_GET_BULKALL_RVP;
 +      vecp[i++].u_fc.get_bulkall_rvp = xfs_dm_get_bulkall_rvp;
 +      vecp[i].func_no = DM_FSYS_GET_BULKATTR_RVP;
 +      vecp[i++].u_fc.get_bulkattr_rvp = xfs_dm_get_bulkattr_rvp;
 +      vecp[i].func_no = DM_FSYS_GET_CONFIG;
 +      vecp[i++].u_fc.get_config = xfs_dm_get_config;
 +      vecp[i].func_no = DM_FSYS_GET_CONFIG_EVENTS;
 +      vecp[i++].u_fc.get_config_events = xfs_dm_get_config_events;
 +      vecp[i].func_no = DM_FSYS_GET_DESTROY_DMATTR;
 +      vecp[i++].u_fc.get_destroy_dmattr = xfs_dm_get_destroy_dmattr;
 +      vecp[i].func_no = DM_FSYS_GET_DIOINFO;
 +      vecp[i++].u_fc.get_dioinfo = xfs_dm_get_dioinfo;
 +      vecp[i].func_no = DM_FSYS_GET_DIRATTRS_RVP;
 +      vecp[i++].u_fc.get_dirattrs_rvp = xfs_dm_get_dirattrs_rvp;
 +      vecp[i].func_no = DM_FSYS_GET_DMATTR;
 +      vecp[i++].u_fc.get_dmattr = xfs_dm_get_dmattr;
 +      vecp[i].func_no = DM_FSYS_GET_EVENTLIST;
 +      vecp[i++].u_fc.get_eventlist = xfs_dm_get_eventlist;
 +      vecp[i].func_no = DM_FSYS_GET_FILEATTR;
 +      vecp[i++].u_fc.get_fileattr = xfs_dm_get_fileattr;
 +      vecp[i].func_no = DM_FSYS_GET_REGION;
 +      vecp[i++].u_fc.get_region = xfs_dm_get_region;
 +      vecp[i].func_no = DM_FSYS_GETALL_DMATTR;
 +      vecp[i++].u_fc.getall_dmattr = xfs_dm_getall_dmattr;
 +      vecp[i].func_no = DM_FSYS_GETALL_INHERIT;
 +      vecp[i++].u_fc.getall_inherit = xfs_dm_getall_inherit;
 +      vecp[i].func_no = DM_FSYS_INIT_ATTRLOC;
 +      vecp[i++].u_fc.init_attrloc = xfs_dm_init_attrloc;
 +      vecp[i].func_no = DM_FSYS_MKDIR_BY_HANDLE;
 +      vecp[i++].u_fc.mkdir_by_handle = xfs_dm_mkdir_by_handle;
 +      vecp[i].func_no = DM_FSYS_PROBE_HOLE;
 +      vecp[i++].u_fc.probe_hole = xfs_dm_probe_hole;
 +      vecp[i].func_no = DM_FSYS_PUNCH_HOLE;
 +      vecp[i++].u_fc.punch_hole = xfs_dm_punch_hole;
 +      vecp[i].func_no = DM_FSYS_READ_INVIS_RVP;
 +      vecp[i++].u_fc.read_invis_rvp = xfs_dm_read_invis_rvp;
 +      vecp[i].func_no = DM_FSYS_RELEASE_RIGHT;
 +      vecp[i++].u_fc.release_right = xfs_dm_release_right;
 +      vecp[i].func_no = DM_FSYS_REMOVE_DMATTR;
 +      vecp[i++].u_fc.remove_dmattr = xfs_dm_remove_dmattr;
 +      vecp[i].func_no = DM_FSYS_REQUEST_RIGHT;
 +      vecp[i++].u_fc.request_right = xfs_dm_request_right;
 +      vecp[i].func_no = DM_FSYS_SET_DMATTR;
 +      vecp[i++].u_fc.set_dmattr = xfs_dm_set_dmattr;
 +      vecp[i].func_no = DM_FSYS_SET_EVENTLIST;
 +      vecp[i++].u_fc.set_eventlist = xfs_dm_set_eventlist;
 +      vecp[i].func_no = DM_FSYS_SET_FILEATTR;
 +      vecp[i++].u_fc.set_fileattr = xfs_dm_set_fileattr;
 +      vecp[i].func_no = DM_FSYS_SET_INHERIT;
 +      vecp[i++].u_fc.set_inherit = xfs_dm_set_inherit;
 +      vecp[i].func_no = DM_FSYS_SET_REGION;
 +      vecp[i++].u_fc.set_region = xfs_dm_set_region;
 +      vecp[i].func_no = DM_FSYS_SYMLINK_BY_HANDLE;
 +      vecp[i++].u_fc.symlink_by_handle = xfs_dm_symlink_by_handle;
 +      vecp[i].func_no = DM_FSYS_SYNC_BY_HANDLE;
 +      vecp[i++].u_fc.sync_by_handle = xfs_dm_sync_by_handle;
 +      vecp[i].func_no = DM_FSYS_UPGRADE_RIGHT;
 +      vecp[i++].u_fc.upgrade_right = xfs_dm_upgrade_right;
 +      vecp[i].func_no = DM_FSYS_WRITE_INVIS_RVP;
 +      vecp[i++].u_fc.write_invis_rvp = xfs_dm_write_invis_rvp;
 +      vecp[i].func_no = DM_FSYS_OBJ_REF_HOLD;
 +      vecp[i++].u_fc.obj_ref_hold = xfs_dm_obj_ref_hold;
 +
 +      return(0);
 +}
 +
 +
 +/*    xfs_dm_send_mmap_event - send events needed for memory mapping a file.
 + *
 + *    This is a workaround called for files that are about to be
 + *    mapped.  DMAPI events are not being generated at a low enough level
 + *    in the kernel for page reads/writes to generate the correct events.
 + *    So for memory-mapped files we generate read or write events for the
 + *    whole byte range being mapped.  If the mmap call can never cause a
 + *    write to the file, then only a read event is sent.
 + *
 + *    Code elsewhere prevents adding managed regions to a file while it
 + *    is still mapped.
 + */
 +
 +STATIC int
 +xfs_dm_send_mmap_event(
 +      struct vm_area_struct *vma,
 +      unsigned int    wantflag)
 +{
 +      xfs_inode_t     *ip;
 +      int             error = 0;
 +      dm_eventtype_t  max_event = DM_EVENT_READ;
 +      xfs_fsize_t     filesize;
 +      xfs_off_t       length, end_of_area, evsize, offset;
 +      int             iolock;
 +
 +      if (!vma->vm_file)
 +              return 0;
 +
 +      ip = XFS_I(vma->vm_file->f_dentry->d_inode);
 +
 +      if (!S_ISREG(vma->vm_file->f_dentry->d_inode->i_mode) ||
 +          !(ip->i_mount->m_flags & XFS_MOUNT_DMAPI))
 +              return 0;
 +
 +      /* If they specifically asked for 'read', then give it to them.
 +       * Otherwise, see if it's possible to give them 'write'.
 +       */
 +      if( wantflag & VM_READ ){
 +              max_event = DM_EVENT_READ;
 +      }
 +      else if( ! (vma->vm_flags & VM_DENYWRITE) ) {
 +              if((wantflag & VM_WRITE) || (vma->vm_flags & VM_WRITE))
 +                      max_event = DM_EVENT_WRITE;
 +      }
 +
 +      if( (wantflag & VM_WRITE) && (max_event != DM_EVENT_WRITE) ){
 +              return -EACCES;
 +      }
 +
 +      /* Figure out how much of the file is being requested by the user. */
 +      offset = 0; /* beginning of file, for now */
 +      length = 0; /* whole file, for now */
 +
 +      filesize = ip->i_new_size;
 +      if (filesize < ip->i_size) {
 +              filesize = ip->i_size;
 +      }
 +
 +      /* Set first byte number beyond the map area. */
 +
 +      if (length) {
 +              end_of_area = offset + length;
 +              if (end_of_area > filesize)
 +                      end_of_area = filesize;
 +      } else {
 +              end_of_area = filesize;
 +      }
 +
 +      /* Set the real amount being mapped. */
 +      evsize = end_of_area - offset;
 +      if (evsize < 0)
 +              evsize = 0;
 +
 +      if (max_event == DM_EVENT_READ)
 +              iolock = XFS_IOLOCK_SHARED;
 +      else
 +              iolock = XFS_IOLOCK_EXCL;
 +
 +      xfs_ilock(ip, iolock);
 +      /* If write possible, try a DMAPI write event */
 +      if (max_event == DM_EVENT_WRITE && DM_EVENT_ENABLED(ip, max_event)) {
 +              error = xfs_dm_send_data_event(max_event, ip, offset,
 +                                             evsize, 0, &iolock);
 +              goto out_unlock;
 +      }
 +
 +      /* Try a read event if max_event was != DM_EVENT_WRITE or if it
 +       * was DM_EVENT_WRITE but the WRITE event was not enabled.
 +       */
 +      if (DM_EVENT_ENABLED(ip, DM_EVENT_READ)) {
 +              error = xfs_dm_send_data_event(DM_EVENT_READ, ip, offset,
 +                                             evsize, 0, &iolock);
 +      }
 +out_unlock:
 +      xfs_iunlock(ip, iolock);
 +      return -error;
 +}
 +
 +
 +STATIC int
 +xfs_dm_send_destroy_event(
 +      xfs_inode_t     *ip,
 +      dm_right_t      vp_right)       /* always DM_RIGHT_NULL */
 +{
 +      /* Returns positive errors to XFS */
 +      return -dm_send_destroy_event(&ip->i_vnode, vp_right);
 +}
 +
 +
 +STATIC int
 +xfs_dm_send_namesp_event(
 +      dm_eventtype_t  event,
 +      struct xfs_mount *mp,
 +      xfs_inode_t     *ip1,
 +      dm_right_t      vp1_right,
 +      xfs_inode_t     *ip2,
 +      dm_right_t      vp2_right,
 +      const char      *name1,
 +      const char      *name2,
 +      mode_t          mode,
 +      int             retcode,
 +      int             flags)
 +{
 +      /* Returns positive errors to XFS */
 +      return -dm_send_namesp_event(event, mp ? mp->m_super : NULL,
 +                                  &ip1->i_vnode, vp1_right,
 +                                  ip2 ? &ip2->i_vnode : NULL, vp2_right,
 +                                  name1, name2,
 +                                  mode, retcode, flags);
 +}
 +
 +STATIC int
 +xfs_dm_send_mount_event(
 +      struct xfs_mount        *mp,
 +      dm_right_t              root_right,
 +      char                    *mtpt,
 +      char                    *fsname)
 +{
 +      return dm_send_mount_event(mp->m_super, root_right,
 +                      NULL, DM_RIGHT_NULL,
 +                      mp->m_rootip ? VFS_I(mp->m_rootip) : NULL,
 +                      DM_RIGHT_NULL, mtpt, fsname);
 +}
 +
 +STATIC void
 +xfs_dm_send_unmount_event(
 +      struct xfs_mount *mp,
 +      xfs_inode_t     *ip,            /* NULL if unmount successful */
 +      dm_right_t      vfsp_right,
 +      mode_t          mode,
 +      int             retcode,        /* errno, if unmount failed */
 +      int             flags)
 +{
 +      dm_send_unmount_event(mp->m_super, ip ? &ip->i_vnode : NULL,
 +                            vfsp_right, mode, retcode, flags);
 +}
 +
 +
 +/*
 + * Data migration operations accessed by the rest of XFS.
 + * When DMAPI support is configured in, this vector is used.
 + */
 +
 +xfs_dmops_t   xfs_dmcore_xfs = {
 +      .xfs_send_data          = xfs_dm_send_data_event,
 +      .xfs_send_mmap          = xfs_dm_send_mmap_event,
 +      .xfs_send_destroy       = xfs_dm_send_destroy_event,
 +      .xfs_send_namesp        = xfs_dm_send_namesp_event,
 +      .xfs_send_mount         = xfs_dm_send_mount_event,
 +      .xfs_send_unmount       = xfs_dm_send_unmount_event,
 +};
 +EXPORT_SYMBOL(xfs_dmcore_xfs);
 +
 +STATIC int
 +xfs_dm_fh_to_inode(
 +      struct super_block      *sb,
 +      struct inode            **inode,
 +      dm_fid_t                *dmfid)
 +{
 +      xfs_mount_t             *mp = XFS_M(sb);
 +      xfs_inode_t             *ip;
 +      xfs_ino_t               ino;
 +      unsigned int            igen;
 +      int                     error;
 +
 +      *inode = NULL;
 +
 +      if (!dmfid->dm_fid_len) {
 +              /* filesystem handle */
 +              *inode = igrab(&mp->m_rootip->i_vnode);
 +              if (!*inode)
 +                      return -ENOENT;
 +              return 0;
 +      }
 +
 +      if (dmfid->dm_fid_len != sizeof(*dmfid) - sizeof(dmfid->dm_fid_len))
 +              return -EINVAL;
 +
 +      ino  = dmfid->dm_fid_ino;
 +      igen = dmfid->dm_fid_gen;
 +
 +      /* fail requests for ino 0 gracefully. */
 +      if (ino == 0)
 +              return -ESTALE;
 +
 +      error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
 +      if (error)
 +              return -error;
 +      if (!ip)
 +              return -EIO;
 +
 +      if (!ip->i_d.di_mode || ip->i_d.di_gen != igen) {
 +              xfs_iput_new(ip, XFS_ILOCK_SHARED);
 +              return -ENOENT;
 +      }
 +
 +      *inode = &ip->i_vnode;
 +      xfs_iunlock(ip, XFS_ILOCK_SHARED);
 +      return 0;
 +}
 +
 +STATIC int
 +xfs_dm_inode_to_fh(
 +      struct inode            *inode,
 +      dm_fid_t                *dmfid,
 +      dm_fsid_t               *dmfsid)
 +{
 +      xfs_inode_t             *ip = XFS_I(inode);
 +
 +      /* Returns negative errors to DMAPI */
 +
 +      if (ip->i_mount->m_fixedfsid == NULL)
 +              return -EINVAL;
 +
 +      dmfid->dm_fid_len = sizeof(dm_fid_t) - sizeof(dmfid->dm_fid_len);
 +      dmfid->dm_fid_pad = 0;
 +      /*
 +       * use memcpy because the inode number is a long long and there's no
 +       * assurance that dmfid->dm_fid_ino is properly aligned.
 +       */
 +      memcpy(&dmfid->dm_fid_ino, &ip->i_ino, sizeof(dmfid->dm_fid_ino));
 +      dmfid->dm_fid_gen = ip->i_d.di_gen;
 +
 +      memcpy(dmfsid, ip->i_mount->m_fixedfsid, sizeof(*dmfsid));
 +      return 0;
 +}
 +
 +STATIC void
 +xfs_dm_get_fsid(
 +      struct super_block      *sb,
 +      dm_fsid_t               *fsid)
 +{
 +      memcpy(fsid, XFS_M(sb)->m_fixedfsid, sizeof(*fsid));
 +}
 +
 +/*
 + * Filesystem operations accessed by the DMAPI core.
 + */
 +static struct filesystem_dmapi_operations xfs_dmapiops = {
 +      .get_fsys_vector        = xfs_dm_get_dmapiops,
 +      .fh_to_inode            = xfs_dm_fh_to_inode,
 +      .inode_to_fh            = xfs_dm_inode_to_fh,
 +      .get_fsid               = xfs_dm_get_fsid,
 +};
 +
 +static int __init
 +xfs_dm_init(void)
 +{
 +      printk(KERN_INFO "SGI XFS Data Management API subsystem\n");
 +
 +      dmapi_register(&xfs_fs_type, &xfs_dmapiops);
 +      return 0;
 +}
 +
 +static void __exit
 +xfs_dm_exit(void)
 +{
 +      dmapi_unregister(&xfs_fs_type);
 +}
 +
 +MODULE_AUTHOR("Silicon Graphics, Inc.");
 +MODULE_DESCRIPTION("SGI XFS dmapi subsystem");
 +MODULE_LICENSE("GPL");
 +
 +module_init(xfs_dm_init);
 +module_exit(xfs_dm_exit);
  #include <linux/dcache.h>
  
  static const struct vm_operations_struct xfs_file_vm_ops;
 +#ifdef HAVE_DMAPI
 +static struct vm_operations_struct xfs_dmapi_file_vm_ops;
 +#endif
  
- STATIC ssize_t
- xfs_file_aio_read(
-       struct kiocb            *iocb,
-       const struct iovec      *iov,
-       unsigned long           nr_segs,
-       loff_t                  pos)
+ /*
+  *    xfs_iozero
+  *
+  *    xfs_iozero clears the specified range of the supplied buffer,
+  *    and marks all the affected blocks as valid and modified.  If
+  *    an affected block is not allocated, it will be allocated.  If
+  *    an affected block is not completely overwritten, and is not
+  *    valid before the operation, it will be read from disk before
+  *    being partially zeroed.
+  */
+ STATIC int
+ xfs_iozero(
+       struct xfs_inode        *ip,    /* inode                        */
+       loff_t                  pos,    /* offset in file               */
+       size_t                  count)  /* size of data to zero         */
  {
-       struct file             *file = iocb->ki_filp;
-       int                     ioflags = 0;
+       struct page             *page;
+       struct address_space    *mapping;
+       int                     status;
  
-       BUG_ON(iocb->ki_pos != pos);
-       if (unlikely(file->f_flags & O_DIRECT))
-               ioflags |= IO_ISDIRECT;
-       if (file->f_mode & FMODE_NOCMTIME)
-               ioflags |= IO_INVIS;
-       return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov,
-                               nr_segs, &iocb->ki_pos, ioflags);
+       mapping = VFS_I(ip)->i_mapping;
+       do {
+               unsigned offset, bytes;
+               void *fsdata;
+               offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+               bytes = PAGE_CACHE_SIZE - offset;
+               if (bytes > count)
+                       bytes = count;
+               status = pagecache_write_begin(NULL, mapping, pos, bytes,
+                                       AOP_FLAG_UNINTERRUPTIBLE,
+                                       &page, &fsdata);
+               if (status)
+                       break;
+               zero_user(page, offset, bytes);
+               status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
+                                       page, fsdata);
+               WARN_ON(status <= 0); /* can't return less than zero! */
+               pos += bytes;
+               count -= bytes;
+               status = 0;
+       } while (count);
+       return (-status);
  }
  
 -STATIC int
 -xfs_file_fsync(
 -      struct file             *file,
 -      struct dentry           *dentry,
 -      int                     datasync)
++int
++xfs_fsync(struct inode *inode, int datasync)
+ {
 -      struct xfs_inode        *ip = XFS_I(dentry->d_inode);
++      struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_trans        *tp;
+       int                     error = 0;
+       int                     log_flushed = 0;
+       xfs_itrace_entry(ip);
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+               return -XFS_ERROR(EIO);
+       xfs_iflags_clear(ip, XFS_ITRUNCATED);
+       /*
+        * We always need to make sure that the required inode state is safe on
+        * disk.  The inode might be clean but we still might need to force the
+        * log because of committed transactions that haven't hit the disk yet.
+        * Likewise, there could be unflushed non-transactional changes to the
+        * inode core that have to go to disk and this requires us to issue
+        * a synchronous transaction to capture these changes correctly.
+        *
+        * This code relies on the assumption that if the i_update_core field
+        * of the inode is clear and the inode is unpinned then it is clean
+        * and no action is required.
+        */
+       xfs_ilock(ip, XFS_ILOCK_SHARED);
+       /*
+        * First check if the VFS inode is marked dirty.  All the dirtying
+        * of non-transactional updates now goes through mark_inode_dirty*,
+        * which allows us to distinguish between pure timestamp updates
+        * and i_size updates which need to be caught for fdatasync.
+        * After that, also check for the dirty state in the XFS inode, which
+        * might get cleared when the inode gets written out via the AIL
+        * or xfs_iflush_cluster.
+        */
 -      if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) ||
 -          ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
++      if (((inode->i_state & I_DIRTY_DATASYNC) ||
++          ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
+           ip->i_update_core) {
+               /*
+                * Kick off a transaction to log the inode core to get the
+                * updates.  The sync transaction will also force the log.
+                */
+               xfs_iunlock(ip, XFS_ILOCK_SHARED);
+               tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
+               error = xfs_trans_reserve(tp, 0,
+                               XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
+               if (error) {
+                       xfs_trans_cancel(tp, 0);
+                       return -error;
+               }
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               /*
+                * Note - it's possible that we might have pushed ourselves out
+                * of the way during trans_reserve which would flush the inode.
+                * But there's no guarantee that the inode buffer has actually
+                * gone out yet (it's delwri).  Plus the buffer could be pinned
+                * anyway if it's part of an inode in another recent
+                * transaction.  So we play it safe and fire off the
+                * transaction anyway.
+                */
+               xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+               xfs_trans_ihold(tp, ip);
+               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+               xfs_trans_set_sync(tp);
+               error = _xfs_trans_commit(tp, 0, &log_flushed);
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       } else {
+               /*
+                * Timestamps/size haven't changed since last inode flush or
+                * inode transaction commit.  That means either nothing got
+                * written or a transaction committed which caught the updates.
+                * If the latter happened and the transaction hasn't hit the
+                * disk yet, the inode will still be pinned.  If it is,
+                * force the log.
+                */
+               if (xfs_ipincount(ip)) {
+                       error = _xfs_log_force_lsn(ip->i_mount,
+                                       ip->i_itemp->ili_last_lsn,
+                                       XFS_LOG_SYNC, &log_flushed);
+               }
+               xfs_iunlock(ip, XFS_ILOCK_SHARED);
+       }
+       if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
+               /*
+                * If the log write didn't issue an ordered tag we need
+                * to flush the disk cache for the data device now.
+                */
+               if (!log_flushed)
+                       xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
+               /*
+                * If this inode is on the RT dev we need to flush that
+                * cache as well.
+                */
+               if (XFS_IS_REALTIME_INODE(ip))
+                       xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
+       }
+       return -error;
+ }
++STATIC int
++xfs_file_fsync(
++      struct file             *file,
++      struct dentry           *dentry,
++      int                     datasync)
++{
++      return xfs_fsync(dentry->d_inode, datasync);
++}
++
++
  STATIC ssize_t
- xfs_file_aio_write(
+ xfs_file_aio_read(
        struct kiocb            *iocb,
-       const struct iovec      *iov,
+       const struct iovec      *iovp,
        unsigned long           nr_segs,
        loff_t                  pos)
  {
@@@ -163,45 -938,6 +948,23 @@@ xfs_file_release
        return -xfs_release(XFS_I(inode));
  }
  
- /*
-  * We ignore the datasync flag here because a datasync is effectively
-  * identical to an fsync. That is, datasync implies that we need to write
-  * only the metadata needed to be able to access the data that is written
-  * if we crash after the call completes. Hence if we are writing beyond
-  * EOF we have to log the inode size change as well, which makes it a
-  * full fsync. If we don't write beyond EOF, the inode core will be
-  * clean in memory and so we don't need to log the inode, just like
-  * fsync.
-  */
- STATIC int
- xfs_file_fsync(
-       struct file             *file,
-       struct dentry           *dentry,
-       int                     datasync)
- {
-       struct xfs_inode        *ip = XFS_I(dentry->d_inode);
-       xfs_iflags_clear(ip, XFS_ITRUNCATED);
-       return -xfs_fsync(ip);
- }
 +#ifdef HAVE_DMAPI
 +STATIC int
 +xfs_vm_fault(
 +      struct vm_area_struct   *vma,
 +      struct vm_fault *vmf)
 +{
 +      struct inode    *inode = vma->vm_file->f_path.dentry->d_inode;
 +      struct xfs_mount *mp = XFS_M(inode->i_sb);
 +
 +      ASSERT_ALWAYS(mp->m_flags & XFS_MOUNT_DMAPI);
 +
 +      if (XFS_SEND_MMAP(mp, vma, 0))
 +              return VM_FAULT_SIGBUS;
 +      return filemap_fault(vma, vmf);
 +}
 +#endif /* HAVE_DMAPI */
 +
  STATIC int
  xfs_file_readdir(
        struct file     *filp,
Simple merge
@@@ -27,4 -27,4 +27,7 @@@ extern ssize_t xfs_vn_listxattr(struct 
  
  extern void xfs_setup_inode(struct xfs_inode *);
  
++extern int xfs_fsync(struct inode *, int);
++
  #endif /* __XFS_IOPS_H__ */
++
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc fs/xfs/xfs_rw.c
Simple merge
diff --cc fs/xfs/xfs_rw.h
Simple merge
@@@ -584,116 -584,11 +584,6 @@@ xfs_readlink
  }
  
  /*
-  * xfs_fsync
-  *
-  * This is called to sync the inode and its data out to disk.  We need to hold
-  * the I/O lock while flushing the data, and the inode lock while flushing the
-  * inode.  The inode lock CANNOT be held while flushing the data, so acquire
-  * after we're done with that.
-  */
- int
- xfs_fsync(
-       xfs_inode_t     *ip)
- {
-       xfs_trans_t     *tp;
-       int             error = 0;
-       int             log_flushed = 0, changed = 1;
-       xfs_itrace_entry(ip);
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               return XFS_ERROR(EIO);
-       /*
-        * We always need to make sure that the required inode state is safe on
-        * disk.  The inode might be clean but we still might need to force the
-        * log because of committed transactions that haven't hit the disk yet.
-        * Likewise, there could be unflushed non-transactional changes to the
-        * inode core that have to go to disk and this requires us to issue
-        * a synchronous transaction to capture these changes correctly.
-        *
-        * This code relies on the assumption that if the update_* fields
-        * of the inode are clear and the inode is unpinned then it is clean
-        * and no action is required.
-        */
-       xfs_ilock(ip, XFS_ILOCK_SHARED);
-       if (!ip->i_update_core) {
-               /*
-                * Timestamps/size haven't changed since last inode flush or
-                * inode transaction commit.  That means either nothing got
-                * written or a transaction committed which caught the updates.
-                * If the latter happened and the transaction hasn't hit the
-                * disk yet, the inode will be still be pinned.  If it is,
-                * force the log.
-                */
-               xfs_iunlock(ip, XFS_ILOCK_SHARED);
-               if (xfs_ipincount(ip)) {
-                       error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
-                                     XFS_LOG_FORCE | XFS_LOG_SYNC,
-                                     &log_flushed);
-               } else {
-                       /*
-                        * If the inode is not pinned and nothing has changed
-                        * we don't need to flush the cache.
-                        */
-                       changed = 0;
-               }
-       } else  {
-               /*
-                * Kick off a transaction to log the inode core to get the
-                * updates.  The sync transaction will also force the log.
-                */
-               xfs_iunlock(ip, XFS_ILOCK_SHARED);
-               tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
-               error = xfs_trans_reserve(tp, 0,
-                               XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
-               if (error) {
-                       xfs_trans_cancel(tp, 0);
-                       return error;
-               }
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               /*
-                * Note - it's possible that we might have pushed ourselves out
-                * of the way during trans_reserve which would flush the inode.
-                * But there's no guarantee that the inode buffer has actually
-                * gone out yet (it's delwri).  Plus the buffer could be pinned
-                * anyway if it's part of an inode in another recent
-                * transaction.  So we play it safe and fire off the
-                * transaction anyway.
-                */
-               xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-               xfs_trans_ihold(tp, ip);
-               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-               xfs_trans_set_sync(tp);
-               error = _xfs_trans_commit(tp, 0, &log_flushed);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       }
-       if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
-               /*
-                * If the log write didn't issue an ordered tag we need
-                * to flush the disk cache for the data device now.
-                */
-               if (!log_flushed)
-                       xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
-               /*
-                * If this inode is on the RT dev we need to flush that
-                * cache as well.
-                */
-               if (XFS_IS_REALTIME_INODE(ip))
-                       xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
-       }
-       return error;
- }
- /*
 - * Flags for xfs_free_eofblocks
 - */
 -#define XFS_FREE_EOF_TRYLOCK  (1<<0)
 -
 -/*
   * This is called by xfs_inactive to free any blocks beyond eof
   * when the link count isn't zero and by xfs_dm_punch_hole() when
   * punching a hole to EOF.
Simple merge
@@@ -1014,15 -1033,15 +1033,19 @@@ extern int blk_verify_command(unsigned 
  #define MAX_PHYS_SEGMENTS 128
  #define MAX_HW_SEGMENTS 128
  #define SAFE_MAX_SECTORS 255
+ #define MAX_SEGMENT_SIZE      65536
+ enum blk_default_limits {
+       BLK_MAX_SEGMENTS        = 128,
+       BLK_SAFE_MAX_SECTORS    = 255,
 +#ifndef CONFIG_KERNEL_DESKTOP
- #define BLK_DEF_MAX_SECTORS 2048
++      BLK_DEF_MAX_SECTORS     = 2048,
 +#else
- #define BLK_DEF_MAX_SECTORS 1024
+       BLK_DEF_MAX_SECTORS     = 1024,
 +#endif
- #define MAX_SEGMENT_SIZE      65536
- #define BLK_SEG_BOUNDARY_MASK 0xFFFFFFFFUL
+       BLK_MAX_SEGMENT_SIZE    = 65536,
+       BLK_SEG_BOUNDARY_MASK   = 0xFFFFFFFFUL,
+ };
  
  #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
  
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -381,16 -380,9 +380,12 @@@ struct sk_buff 
  #ifdef CONFIG_IPV6_NDISC_NODETYPE
        __u8                    ndisc_nodetype:2;
  #endif
 +#ifdef        CONFIG_NETVM
 +      __u8                    emergency:1;
 +#endif
- #ifdef CONFIG_XEN
-       __u8                    proto_data_valid:1,
-                               proto_csum_blank:1;
- #endif
        kmemcheck_bitfield_end(flags2);
  
-       /* 0/9...15 bit hole */
+       /* 0/14 bit hole */
  
  #ifdef CONFIG_NET_DMA
        dma_cookie_t            dma_cookie;
Simple merge
@@@ -38,9 -38,6 +38,7 @@@ struct kmem_cache_cpu 
        void **freelist;        /* Pointer to first free per cpu object */
        struct page *page;      /* The slab from which we are allocating */
        int node;               /* The node of the page (or -1 for debug) */
-       unsigned int offset;    /* Freepointer offset (in word units) */
-       unsigned int objsize;   /* Size of an object (from kmem_cache) */
 +      int reserve;            /* Did the current page come from the reserve */
  #ifdef CONFIG_SLUB_STATS
        unsigned stat[NR_SLUB_STAT_ITEMS];
  #endif
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc init/Kconfig
@@@ -486,62 -461,8 +502,10 @@@ config LOG_BUF_SHIF
  config HAVE_UNSTABLE_SCHED_CLOCK
        bool
  
- config GROUP_SCHED
-       bool "Group CPU scheduler"
-       depends on EXPERIMENTAL
-       default n if KERNEL_DESKTOP
-       default y
-       help
-         This feature lets CPU scheduler recognize task groups and control CPU
-         bandwidth allocation to such task groups.
-         In order to create a group from arbitrary set of processes, use
-         CONFIG_CGROUPS. (See Control Group support.)
- config FAIR_GROUP_SCHED
-       bool "Group scheduling for SCHED_OTHER"
-       depends on GROUP_SCHED
-       default GROUP_SCHED
- config RT_GROUP_SCHED
-       bool "Group scheduling for SCHED_RR/FIFO"
-       depends on EXPERIMENTAL
-       depends on GROUP_SCHED
-       default n
-       help
-         This feature lets you explicitly allocate real CPU bandwidth
-         to users or control groups (depending on the "Basis for grouping tasks"
-         setting below. If enabled, it will also make it impossible to
-         schedule realtime tasks for non-root users until you allocate
-         realtime bandwidth for them.
-         See Documentation/scheduler/sched-rt-group.txt for more information.
- choice
-       depends on GROUP_SCHED
-       prompt "Basis for grouping tasks"
-       default USER_SCHED
- config USER_SCHED
-       bool "user id"
-       help
-         This option will choose userid as the basis for grouping
-         tasks, thus providing equal CPU bandwidth to each user.
- config CGROUP_SCHED
-       bool "Control groups"
-       depends on CGROUPS
-       help
-         This option allows you to create arbitrary task groups
-         using the "cgroup" pseudo filesystem and control
-         the cpu bandwidth allocated to each such task group.
-         Refer to Documentation/cgroups/cgroups.txt for more
-         information on "cgroup" pseudo filesystem.
- endchoice
  menuconfig CGROUPS
        boolean "Control Group support"
 +      default n if KERNEL_DESKTOP
 +      default y
        help
          This option adds support for grouping sets of processes together, for
          use with process control subsystems such as Cpusets, CFS, memory
@@@ -660,6 -581,36 +624,37 @@@ config CGROUP_MEM_RES_CTLR_SWA
          Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
          size is 4096bytes, 512k per 1Gbytes of swap.
  
+ menuconfig CGROUP_SCHED
+       bool "Group CPU scheduler"
+       depends on EXPERIMENTAL && CGROUPS
 -      default n
++      default n if KERNEL_DESKTOP
++      default y
+       help
+         This feature lets CPU scheduler recognize task groups and control CPU
+         bandwidth allocation to such task groups. It uses cgroups to group
+         tasks.
+ if CGROUP_SCHED
+ config FAIR_GROUP_SCHED
+       bool "Group scheduling for SCHED_OTHER"
+       depends on CGROUP_SCHED
+       default CGROUP_SCHED
+ config RT_GROUP_SCHED
+       bool "Group scheduling for SCHED_RR/FIFO"
+       depends on EXPERIMENTAL
+       depends on CGROUP_SCHED
+       default n
+       help
+         This feature lets you explicitly allocate real CPU bandwidth
+         to users or control groups (depending on the "Basis for grouping tasks"
+         setting below). If enabled, it will also make it impossible to
+         schedule realtime tasks for non-root users until you allocate
+         realtime bandwidth for them.
+         See Documentation/scheduler/sched-rt-group.txt for more information.
+ endif #CGROUP_SCHED
  endif # CGROUPS
  
  config MM_OWNER
@@@ -1155,25 -1093,8 +1137,25 @@@ config MMAP_ALLOW_UNINITIALIZE
  
          See Documentation/nommu-mmap.txt for more information.
  
 +config DEFAULT_VM_DIRTY_RATIO
 +      int "Default VM dirty ratio (in %)"
 +      default 20 if KERNEL_DESKTOP
 +      default 40
 +      help
 +        Allows tuning the VM dirty ratio to suit different workloads. An
 +        increased VM dirty ratio improves performance of most server workloads
 +        that dirty a lot of memory (e.g. simple databases not using direct IO,
 +        workloads doing heavy writes). Latency-sensitive workloads like
 +        desktops and typical workstations perform better with a decreased
 +        VM dirty ratio.
 +
 +        Recommended value for desktop workload is 20.
 +        Recommended value for server workload is 40.
 +
 +        Only use this if you really know what you are doing.
 +
  config PROFILING
-       bool "Profiling support (EXPERIMENTAL)"
+       bool "Profiling support"
        help
          Say Y here to enable the extended profiling support mechanisms used
          by profilers such as OProfile.
diff --cc init/main.c
Simple merge
diff --cc ipc/mqueue.c
@@@ -152,8 -155,9 +155,9 @@@ static struct inode *mqueue_get_inode(s
                        spin_lock(&mq_lock);
                        if (u->mq_bytes + mq_bytes < u->mq_bytes ||
                            u->mq_bytes + mq_bytes >
 -                          p->signal->rlim[RLIMIT_MSGQUEUE].rlim_cur) {
 +                          task_rlimit(p, RLIMIT_MSGQUEUE)) {
                                spin_unlock(&mq_lock);
+                               kfree(info->messages);
                                goto out_inode;
                        }
                        u->mq_bytes += mq_bytes;
index c34ac4f,0000000..64e9a17
mode 100644,000000..100644
--- /dev/null
@@@ -1,1041 -1,0 +1,1043 @@@
 +/*
 + * This file is subject to the terms and conditions of the GNU General Public
 + * License.  See the file "COPYING" in the main directory of this archive
 + * for more details.
 + *
 + * Copyright (c) 1999-2006 Silicon Graphics, Inc.  All Rights Reserved.
 + */
 +
 +#include <linux/blkdev.h>
 +#include <linux/types.h>
 +#include <linux/kdb.h>
 +#include <linux/kdbprivate.h>
 +#include <linux/module.h>
 +#include <linux/init.h>
 +#include <linux/mm.h>
 +#include <linux/swap.h>
 +#include <linux/swapops.h>
 +
 +#include <scsi/scsi.h>
 +#include <scsi/scsi_cmnd.h>
 +#include <scsi/scsi_device.h>
 +#include <scsi/scsi_host.h>
 +#include <asm/pgtable.h>
 +
 +MODULE_AUTHOR("SGI");
 +MODULE_DESCRIPTION("Debug VM information");
 +MODULE_LICENSE("GPL");
 +
 +struct __vmflags {
 +      unsigned long mask;
 +      char *name;
 +};
 +
 +static struct __vmflags vmflags[] = {
 +      { VM_READ, "VM_READ " },
 +      { VM_WRITE, "VM_WRITE " },
 +      { VM_EXEC, "VM_EXEC " },
 +      { VM_SHARED, "VM_SHARED " },
 +      { VM_MAYREAD, "VM_MAYREAD " },
 +      { VM_MAYWRITE, "VM_MAYWRITE " },
 +      { VM_MAYEXEC, "VM_MAYEXEC " },
 +      { VM_MAYSHARE, "VM_MAYSHARE " },
 +      { VM_GROWSDOWN, "VM_GROWSDOWN " },
 +      { VM_GROWSUP, "VM_GROWSUP " },
 +      { VM_PFNMAP, "VM_PFNMAP " },
 +      { VM_DENYWRITE, "VM_DENYWRITE " },
 +      { VM_EXECUTABLE, "VM_EXECUTABLE " },
 +      { VM_LOCKED, "VM_LOCKED " },
 +      { VM_IO, "VM_IO " },
 +      { VM_SEQ_READ, "VM_SEQ_READ " },
 +      { VM_RAND_READ, "VM_RAND_READ " },
 +      { VM_DONTCOPY, "VM_DONTCOPY " },
 +      { VM_DONTEXPAND, "VM_DONTEXPAND " },
 +      { VM_RESERVED, "VM_RESERVED " },
 +      { VM_ACCOUNT, "VM_ACCOUNT " },
 +      { VM_HUGETLB, "VM_HUGETLB " },
 +      { VM_NONLINEAR, "VM_NONLINEAR " },
 +      { VM_MAPPED_COPY, "VM_MAPPED_COPY " },
 +      { VM_INSERTPAGE, "VM_INSERTPAGE " },
 +      { 0, "" }
 +};
 +
 +static int
 +kdbm_print_vm(struct vm_area_struct *vp, unsigned long addr, int verbose_flg)
 +{
 +      struct __vmflags *tp;
 +
 +      kdb_printf("struct vm_area_struct at 0x%lx for %d bytes\n",
 +                 addr, (int) sizeof (struct vm_area_struct));
 +
 +      kdb_printf("vm_start = 0x%p   vm_end = 0x%p\n", (void *) vp->vm_start,
 +                 (void *) vp->vm_end);
 +      kdb_printf("vm_page_prot = 0x%llx\n",
 +              (unsigned long long)pgprot_val(vp->vm_page_prot));
 +
 +      kdb_printf("vm_flags: ");
 +      for (tp = vmflags; tp->mask; tp++) {
 +              if (vp->vm_flags & tp->mask) {
 +                      kdb_printf(" %s", tp->name);
 +              }
 +      }
 +      kdb_printf("\n");
 +
 +      if (!verbose_flg)
 +              return 0;
 +
 +      kdb_printf("vm_mm = 0x%p\n", (void *) vp->vm_mm);
 +      kdb_printf("vm_next = 0x%p\n", (void *) vp->vm_next);
 +      kdb_printf("shared.vm_set.list.next = 0x%p\n", (void *) vp->shared.vm_set.list.next);
 +      kdb_printf("shared.vm_set.list.prev = 0x%p\n", (void *) vp->shared.vm_set.list.prev);
 +      kdb_printf("shared.vm_set.parent = 0x%p\n", (void *) vp->shared.vm_set.parent);
 +      kdb_printf("shared.vm_set.head = 0x%p\n", (void *) vp->shared.vm_set.head);
-       kdb_printf("anon_vma_node.next = 0x%p\n", (void *) vp->anon_vma_node.next);
-       kdb_printf("anon_vma_node.prev = 0x%p\n", (void *) vp->anon_vma_node.prev);
++      kdb_printf("anon_vma_chain.next = 0x%p\n", (void *) vp->anon_vma_chain.next);
++      kdb_printf("anon_vma_chain.prev = 0x%p\n", (void *) vp->anon_vma_chain.prev);
 +      kdb_printf("vm_ops = 0x%p\n", (void *) vp->vm_ops);
 +      if (vp->vm_ops != NULL) {
 +              kdb_printf("vm_ops->open = 0x%p\n", vp->vm_ops->open);
 +              kdb_printf("vm_ops->close = 0x%p\n", vp->vm_ops->close);
 +              kdb_printf("vm_ops->fault = 0x%p\n", vp->vm_ops->fault);
 +#ifdef HAVE_VMOP_MPROTECT
 +              kdb_printf("vm_ops->mprotect = 0x%p\n", vp->vm_ops->mprotect);
 +#endif
 +#ifdef CONFIG_NUMA
 +              kdb_printf("vm_ops->set_policy = 0x%p\n", vp->vm_ops->set_policy);
 +              kdb_printf("vm_ops->get_policy = 0x%p\n", vp->vm_ops->get_policy);
 +#endif
 +      }
 +      kdb_printf("vm_pgoff = 0x%lx\n", vp->vm_pgoff);
 +      kdb_printf("vm_file = 0x%p\n", (void *) vp->vm_file);
 +      kdb_printf("vm_private_data = 0x%p\n", vp->vm_private_data);
 +#ifdef CONFIG_NUMA
 +      kdb_printf("vm_policy = 0x%p\n", vp->vm_policy);
 +#endif
 +
 +      return 0;
 +}
 +
 +static int
 +kdbm_print_vmp(struct vm_area_struct *vp, int verbose_flg)
 +{
 +      struct __vmflags *tp;
 +
 +      if (verbose_flg) {
 +              kdb_printf("0x%lx:  ", (unsigned long) vp);
 +      }
 +
 +      kdb_printf("0x%p  0x%p ", (void *) vp->vm_start, (void *) vp->vm_end);
 +
 +      for (tp = vmflags; tp->mask; tp++) {
 +              if (vp->vm_flags & tp->mask) {
 +                      kdb_printf(" %s", tp->name);
 +              }
 +      }
 +      kdb_printf("\n");
 +
 +      return 0;
 +}
 +
 +
 +#ifdef CONFIG_NUMA
 +#include <linux/mempolicy.h>
 +
 +/*
 + * kdbm_mpol
 + *
 + *    This function implements the 'mempolicy' command.
 + *    Print a struct mempolicy.
 + *
 + *    mempolicy <address>     Print struct mempolicy at <address>
 + */
 +static int
 +kdbm_mpol(int argc, const char **argv)
 +{
 +      unsigned long addr;
 +      long offset = 0;
 +      int nextarg;
 +      int err = 0;
 +      struct mempolicy *mp = NULL;
 +
 +      if (argc != 1)
 +              return KDB_ARGCOUNT;
 +
 +      nextarg = 1;
 +      if ((err = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset,
 +                              NULL)) != 0)
 +              return(err);
 +
 +      if (!(mp = kmalloc(sizeof(*mp), GFP_ATOMIC))) {
 +              kdb_printf("%s: cannot kmalloc mp\n", __FUNCTION__);
 +              goto out;
 +      }
 +
 +      if ((err = kdb_getarea(*mp, addr))) {
 +              kdb_printf("%s: invalid mempolicy address\n", __FUNCTION__);
 +              goto out;
 +      }
 +
 +      kdb_printf("struct mempolicy at 0x%p\n", (struct mempolicy *)addr);
 +      kdb_printf("  refcnt %d\n", atomic_read(&mp->refcnt));
 +
 +      switch (mp->mode) {
 +        case MPOL_DEFAULT:
 +              kdb_printf("  mode %d (MPOL_DEFAULT)\n", mp->mode);
 +              break;
 +
 +        case MPOL_PREFERRED:
 +              kdb_printf("  mode %d (MPOL_PREFERRED)\n", mp->mode);
 +              if (mp->flags & MPOL_F_LOCAL)
 +                      kdb_printf("  preferred_node local\n");
 +              else
 +                      kdb_printf("  preferred_node %d\n", mp->v.preferred_node);
 +              break;
 +
 +        case MPOL_BIND:
 +        case MPOL_INTERLEAVE:
 +        {
 +              int i, nlongs;
 +              unsigned long *longp;
 +
 +              kdb_printf("  mode %d (%s)\n", mp->mode,
 +                      mp->mode == MPOL_INTERLEAVE
 +                              ? "MPOL_INTERLEAVE"
 +                              : "MPOL_BIND");
 +              nlongs = (int)BITS_TO_LONGS(MAX_NUMNODES);
 +              kdb_printf("  nodes:");
 +              longp = mp->v.nodes.bits;
 +              for (i = 0; i < nlongs; i++, longp++)
 +                      kdb_printf("  0x%lx ", *longp);
 +              kdb_printf("\n");
 +              break;
 +        }
 +
 +        default:
 +              kdb_printf("  mode %d (unknown)\n", mp->mode);
 +              break;
 +      }
 +out:
 +      if (mp)
 +              kfree(mp);
 +      return err;
 +}
 +
 +#endif /* CONFIG_NUMA */
 +
 +/*
 + * kdbm_pgdat
 + *
 + *    This function implements the 'pgdat' command.
 + *    Print a struct pglist_data (pg_dat_t).
 + *
 + *    pgdat <node_id>         Print struct pglist_data for node <node_id>.
 + *
 + *    Print pglist_data for node 0 if node_id not specified,
 + *    or print the one pglist_data structure if !CONFIG_NUMA.
 + */
 +static int
 +kdbm_pgdat(int argc, const char **argv)
 +{
 +      int err = 0, node_id = 0, i;
 +      pg_data_t *pgdatp = NULL;
 +
 +#ifdef CONFIG_NUMA
 +      if (argc > 1)
 +              return KDB_ARGCOUNT;
 +      if (argc == 1) {
 +              int nextarg;
 +              long offset = 0;
 +              unsigned long node_id_ul;
 +
 +              nextarg = 1;
 +              if ((err = kdbgetaddrarg(argc, argv, &nextarg, &node_id_ul,
 +                                       &offset, NULL)) != 0) {
 +                      return(err);
 +              }
 +              node_id = (int)node_id_ul;
 +      }
 +#endif
 +      for_each_online_pgdat(pgdatp) {
 +              if (pgdatp->node_id == node_id)
 +                      break;
 +      }
 +      if (!pgdatp) {
 +              kdb_printf("%s: specified node not found\n", __FUNCTION__);
 +              return 0;
 +      }
 +      kdb_printf("struct pglist_data at 0x%p  node_id = %d\n",
 +                 pgdatp, pgdatp->node_id);
 +
 +      for (i = 0; i < MAX_ZONELISTS; i++) {
 +              int zr;
 +              struct zoneref *zonerefp;
 +              struct zone *zonep;
 +
 +              zonerefp = pgdatp->node_zonelists[i]._zonerefs;
 +              kdb_printf("  _zonerefs[%d] at 0x%p\n", i, zonerefp);
 +
 +              for (zr = 0; zr <= MAX_ZONES_PER_ZONELIST; zr++, zonerefp++) {
 +                      int z;
 +                      pg_data_t *tmp_pgdatp;
 +
 +                      zonep = zonelist_zone(zonerefp);
 +                      if (!zonep)
 +                              break;
 +
 +                      kdb_printf("    0x%p", zonep);
 +
 +                      for_each_online_pgdat(tmp_pgdatp) {
 +                              for (z = 0; z < MAX_NR_ZONES; z++) {
 +                                      if (zonep == &tmp_pgdatp->node_zones[z]) {
 +                                              kdb_printf ("  (node %d node_zones[%d])",
 +                                                   tmp_pgdatp->node_id, z);
 +                                              break;
 +                                      }
 +                              }
 +                              if (z != MAX_NR_ZONES)
 +                                      break;  /* found it */
 +                      }
 +                      kdb_printf("\n");
 +              }
 +      }
 +
 +      kdb_printf("  nr_zones = %d", pgdatp->nr_zones);
 +#ifdef CONFIG_FLAT_NODE_MEM_MAP
 +      kdb_printf("  node_mem_map = 0x%p\n", pgdatp->node_mem_map);
 +#endif
++#ifndef CONFIG_NO_BOOTMEM
 +      kdb_printf("  bdata = 0x%p", pgdatp->bdata);
++#endif
 +      kdb_printf("  node_start_pfn = 0x%lx\n", pgdatp->node_start_pfn);
 +      kdb_printf("  node_present_pages = %ld (0x%lx)\n",
 +                 pgdatp->node_present_pages, pgdatp->node_present_pages);
 +      kdb_printf("  node_spanned_pages = %ld (0x%lx)\n",
 +                 pgdatp->node_spanned_pages, pgdatp->node_spanned_pages);
 +      kdb_printf("  kswapd = 0x%p\n", pgdatp->kswapd);
 +
 +      return err;
 +}
 +
 +/*
 + * kdbm_vm
 + *
 + *     This function implements the 'vm' command.  Print a vm_area_struct.
 + *
 + *     vm [-v] <address>      Print vm_area_struct at <address>
 + *     vmp [-v] <pid>         Print all vm_area_structs for <pid>
 + */
 +
 +static int
 +kdbm_vm(int argc, const char **argv)
 +{
 +      unsigned long addr;
 +      long offset = 0;
 +      int nextarg;
 +      int diag;
 +      int verbose_flg = 0;
 +
 +      if (argc == 2) {
 +              if (strcmp(argv[1], "-v") != 0) {
 +                      return KDB_ARGCOUNT;
 +              }
 +              verbose_flg = 1;
 +      } else if (argc != 1) {
 +              return KDB_ARGCOUNT;
 +      }
 +
 +      if (strcmp(argv[0], "vmp") == 0) {
 +              struct task_struct *g, *tp;
 +              struct vm_area_struct *vp;
 +              pid_t pid;
 +
 +              if ((diag = kdbgetularg(argv[argc], (unsigned long *) &pid)))
 +                      return diag;
 +
 +              kdb_do_each_thread(g, tp) {
 +                      if (tp->pid == pid) {
 +                              if (tp->mm != NULL) {
 +                                      if (verbose_flg)
 +                                              kdb_printf
 +                                                  ("vm_area_struct       ");
 +                                      kdb_printf
 +                                          ("vm_start            vm_end              vm_flags\n");
 +                                      vp = tp->mm->mmap;
 +                                      while (vp != NULL) {
 +                                              kdbm_print_vmp(vp, verbose_flg);
 +                                              vp = vp->vm_next;
 +                                      }
 +                              }
 +                              return 0;
 +                      }
 +              } kdb_while_each_thread(g, tp);
 +
 +              kdb_printf("No process with pid == %d found\n", pid);
 +
 +      } else {
 +              struct vm_area_struct v;
 +
 +              nextarg = argc;
 +              if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset,
 +                                        NULL))
 +                  || (diag = kdb_getarea(v, addr)))
 +                      return (diag);
 +
 +              kdbm_print_vm(&v, addr, verbose_flg);
 +      }
 +
 +      return 0;
 +}
 +
 +static int
 +kdbm_print_pte(pte_t * pte)
 +{
 +      kdb_printf("0x%lx (", (unsigned long) pte_val(*pte));
 +
 +      if (pte_present(*pte)) {
 +#ifdef        pte_exec
 +              if (pte_exec(*pte))
 +                      kdb_printf("X");
 +#endif
 +              if (pte_write(*pte))
 +                      kdb_printf("W");
 +#ifdef        pte_read
 +              if (pte_read(*pte))
 +                      kdb_printf("R");
 +#endif
 +              if (pte_young(*pte))
 +                      kdb_printf("A");
 +              if (pte_dirty(*pte))
 +                      kdb_printf("D");
 +
 +      } else {
 +              kdb_printf("OFFSET=0x%lx ", swp_offset(pte_to_swp_entry(*pte)));
 +              kdb_printf("TYPE=0x%x", swp_type(pte_to_swp_entry(*pte)));
 +      }
 +
 +      kdb_printf(")");
 +
 +      /* final newline is output by caller of kdbm_print_pte() */
 +
 +      return 0;
 +}
 +
 +/*
 + * kdbm_pte
 + *
 + *     This function implements the 'pte' command.  Print all pte_t structures
 + *     that map to the given virtual address range (<address> through <address>
 + *     plus <nbytes>) for the given process. The default value for nbytes is
 + *     one.
 + *
 + *     pte -m <mm> <address> [<nbytes>]    Print all pte_t structures for
 + *                                       virtual <address> in address space
 + *                                       of <mm> which is a pointer to a
 + *                                       mm_struct
 + *     pte -p <pid> <address> [<nbytes>]   Print all pte_t structures for
 + *                                       virtual <address> in address space
 + *                                       of <pid>
 + */
 +
 +static int
 +kdbm_pte(int argc, const char **argv)
 +{
 +      unsigned long addr;
 +      long offset = 0;
 +      int nextarg;
 +      unsigned long nbytes = 1;
 +      long npgs;
 +      int diag;
 +      int found;
 +      pid_t pid;
 +      struct task_struct *tp;
 +      struct mm_struct *mm, copy_of_mm;
 +      pgd_t *pgd;
 +      pud_t *pud;
 +      pmd_t *pmd;
 +      pte_t *pte;
 +
 +      if (argc < 3 || argc > 4) {
 +              return KDB_ARGCOUNT;
 +      }
 +
 +       if (strcmp(argv[1], "-p") == 0) {
 +              if ((diag = kdbgetularg(argv[2], (unsigned long *) &pid))) {
 +                      return diag;
 +              }
 +
 +              found = 0;
 +              for_each_process(tp) {
 +                      if (tp->pid == pid) {
 +                              if (tp->mm != NULL) {
 +                                      found = 1;
 +                                      break;
 +                              }
 +                              kdb_printf("task structure's mm field is NULL\n");
 +                              return 0;
 +                      }
 +              }
 +
 +              if (!found) {
 +                      kdb_printf("No process with pid == %d found\n", pid);
 +                      return 0;
 +              }
 +              mm = tp->mm;
 +      } else if (strcmp(argv[1], "-m") == 0) {
 +
 +
 +              nextarg = 2;
 +              if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset,
 +                                        NULL))
 +                  || (diag = kdb_getarea(copy_of_mm, addr)))
 +                      return (diag);
 +              mm = &copy_of_mm;
 +      } else {
 +              return KDB_ARGCOUNT;
 +      }
 +
 +      if ((diag = kdbgetularg(argv[3], &addr))) {
 +              return diag;
 +      }
 +
 +      if (argc == 4) {
 +              if ((diag = kdbgetularg(argv[4], &nbytes))) {
 +                      return diag;
 +              }
 +      }
 +
 +      kdb_printf("vaddr              pte\n");
 +
 +      npgs = ((((addr & ~PAGE_MASK) + nbytes) + ~PAGE_MASK) >> PAGE_SHIFT);
 +      while (npgs-- > 0) {
 +
 +              kdb_printf("0x%p ", (void *) (addr & PAGE_MASK));
 +
 +              pgd = pgd_offset(mm, addr);
 +              if (pgd_present(*pgd)) {
 +                      pud = pud_offset(pgd, addr);
 +                      if (pud_present(*pud)) {
 +                              pmd = pmd_offset(pud, addr);
 +                              if (pmd_present(*pmd)) {
 +                                      pte = pte_offset_map(pmd, addr);
 +                                      if (pte_present(*pte)) {
 +                                              kdbm_print_pte(pte);
 +                                      }
 +                              }
 +                      }
 +              }
 +
 +              kdb_printf("\n");
 +              addr += PAGE_SIZE;
 +      }
 +
 +      return 0;
 +}
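
The page-count arithmetic above rounds the byte range up to whole pages. A stand-alone sketch with assumed 4 KiB pages and made-up values shows how a request that straddles a page boundary comes out as two pages:

#include <stdio.h>

/* Assumed 4 KiB pages; PAGE_MASK/PAGE_SHIFT mirror the kernel definitions. */
#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long addr = 0x400ff0;  /* 16 bytes below a page boundary */
        unsigned long nbytes = 0x20;    /* spills into the following page */
        long npgs = ((((addr & ~PAGE_MASK) + nbytes) + ~PAGE_MASK) >> PAGE_SHIFT);

        printf("%ld page(s)\n", npgs);  /* prints "2 page(s)" */
        return 0;
}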
 +
 +/*
 + * kdbm_rpte
 + *
 + *     This function implements the 'rpte' command.  Print all pte_t structures
 + *     that contain the given physical page range (<pfn> through <pfn>
 + *     plus <npages>) for the given process. The default value for npages is
 + *     one.
 + *
 + *     rpte -m <mm> <pfn> [<npages>]     Print all pte_t structures for
 + *                                       physical page <pfn> in address space
 + *                                       of <mm> which is a pointer to a
 + *                                       mm_struct
 + *     rpte -p <pid> <pfn> [<npages>]    Print all pte_t structures for
 + *                                       physical page <pfn> in address space
 + *                                       of <pid>
 + */
 +
 +static int
 +kdbm_rpte(int argc, const char **argv)
 +{
 +      unsigned long addr;
 +      unsigned long pfn;
 +      long offset = 0;
 +      int nextarg;
 +      unsigned long npages = 1;
 +      int diag;
 +      int found;
 +      pid_t pid;
 +      struct task_struct *tp;
 +      struct mm_struct *mm, copy_of_mm;
 +      pgd_t *pgd;
 +      pud_t *pud;
 +      pmd_t *pmd;
 +      pte_t *pte;
 +      unsigned long g, u, m, t;
 +
 +      if (argc < 3 || argc > 4) {
 +              return KDB_ARGCOUNT;
 +      }
 +
 +       if (strcmp(argv[1], "-p") == 0) {
 +              if ((diag = kdbgetularg(argv[2], (unsigned long *) &pid))) {
 +                      return diag;
 +              }
 +
 +              found = 0;
 +              for_each_process(tp) {
 +                      if (tp->pid == pid) {
 +                              if (tp->mm != NULL) {
 +                                      found = 1;
 +                                      break;
 +                              }
 +                              kdb_printf("task structure's mm field is NULL\n");
 +                              return 0;
 +                      }
 +              }
 +
 +              if (!found) {
 +                      kdb_printf("No process with pid == %d found\n", pid);
 +                      return 0;
 +              }
 +              mm = tp->mm;
 +      } else if (strcmp(argv[1], "-m") == 0) {
 +              nextarg = 2;
 +              if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset,
 +                                        NULL))
 +                  || (diag = kdb_getarea(copy_of_mm, addr)))
 +                      return (diag);
 +              mm = &copy_of_mm;
 +      } else {
 +              return KDB_ARGCOUNT;
 +      }
 +
 +      if ((diag = kdbgetularg(argv[3], &pfn))) {
 +              return diag;
 +      }
 +
 +      if (argc == 4) {
 +              if ((diag = kdbgetularg(argv[4], &npages))) {
 +                      return diag;
 +              }
 +      }
 +
 +      /* the number of spaces after vaddr depends on sizeof(unsigned long) */
 +      kdb_printf("pfn              vaddr%*s pte\n",
 +                 (int)(2*sizeof(unsigned long) + 2 - 5), " ");
 +
 +      for (g = 0, pgd = pgd_offset(mm, 0UL); g < PTRS_PER_PGD; ++g, ++pgd) {
 +              if (pgd_none(*pgd) || pgd_bad(*pgd))
 +                      continue;
 +              for (u = 0, pud = pud_offset(pgd, 0UL); u < PTRS_PER_PUD; ++u, ++pud) {
 +                      if (pud_none(*pud) || pud_bad(*pud))
 +                              continue;
 +                      for (m = 0, pmd = pmd_offset(pud, 0UL); m < PTRS_PER_PMD; ++m, ++pmd) {
 +                              if (pmd_none(*pmd) || pmd_bad(*pmd))
 +                                      continue;
 +                              for (t = 0, pte = pte_offset_map(pmd, 0UL); t < PTRS_PER_PTE; ++t, ++pte) {
 +                                      if (pte_none(*pte))
 +                                              continue;
 +                                      if (pte_pfn(*pte) < pfn || pte_pfn(*pte) >= (pfn + npages))
 +                                              continue;
 +                                      addr = g << PGDIR_SHIFT;
 +#ifdef __ia64__
 +                                      /* IA64 plays tricks with the pgd mapping to save space.
 +                                       * This reverses pgd_index().
 +                                       */
 +                                      {
 +                                              unsigned long region = g >> (PAGE_SHIFT - 6);
 +                                              unsigned long l1index = g - (region << (PAGE_SHIFT - 6));
 +                                              addr = (region << 61) + (l1index << PGDIR_SHIFT);
 +                                      }
 +#endif
 +                                      addr += (u << PUD_SHIFT) + (m << PMD_SHIFT) + (t << PAGE_SHIFT);
 +                                      kdb_printf("0x%-14lx " kdb_bfd_vma_fmt0 " ",
 +                                                 pte_pfn(*pte), addr);
 +                                      kdbm_print_pte(pte);
 +                                      kdb_printf("\n");
 +                              }
 +                      }
 +              }
 +      }
 +
 +      return 0;
 +}
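
For the non-IA64 case above, the virtual address is rebuilt purely from the table indices. A stand-alone sketch with assumed x86_64 4-level shift values (not taken from the patch) makes the arithmetic concrete; with a folded pud, u is always 0 and its term vanishes:

#include <stdio.h>

/* Assumed x86_64, 4 KiB-page shift values. */
#define PAGE_SHIFT  12
#define PMD_SHIFT   21
#define PUD_SHIFT   30
#define PGDIR_SHIFT 39

int main(void)
{
        unsigned long g = 0, u = 0, m = 2, t = 5;       /* hypothetical indices */
        unsigned long addr = (g << PGDIR_SHIFT) + (u << PUD_SHIFT) +
                             (m << PMD_SHIFT) + (t << PAGE_SHIFT);

        printf("vaddr = 0x%lx\n", addr);        /* 0x405000 */
        return 0;
}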
 +
 +static int
 +kdbm_print_dentry(unsigned long daddr)
 +{
 +      struct dentry d;
 +      int diag;
 +      char buf[256];
 +
 +      kdb_printf("Dentry at 0x%lx\n", daddr);
 +      if ((diag = kdb_getarea(d, (unsigned long)daddr)))
 +              return diag;
 +
 +      if ((d.d_name.len > sizeof(buf)) || (diag = kdb_getarea_size(buf, (unsigned long)(d.d_name.name), d.d_name.len)))
 +              kdb_printf(" d_name.len = %d d_name.name = 0x%p\n",
 +                                      d.d_name.len, d.d_name.name);
 +      else
 +              kdb_printf(" d_name.len = %d d_name.name = 0x%p <%.*s>\n",
 +                                      d.d_name.len, d.d_name.name,
 +                                      (int)(d.d_name.len), d.d_name.name);
 +
 +      kdb_printf(" d_count = %d d_flags = 0x%x d_inode = 0x%p\n",
 +                                      atomic_read(&d.d_count), d.d_flags, d.d_inode);
 +
 +      kdb_printf(" d_parent = 0x%p\n", d.d_parent);
 +
 +      kdb_printf(" d_hash.nxt = 0x%p d_hash.prv = 0x%p\n",
 +                                      d.d_hash.next, d.d_hash.pprev);
 +
 +      kdb_printf(" d_lru.nxt = 0x%p d_lru.prv = 0x%p\n",
 +                                      d.d_lru.next, d.d_lru.prev);
 +
 +      kdb_printf(" d_child.nxt = 0x%p d_child.prv = 0x%p\n",
 +                                      d.d_u.d_child.next, d.d_u.d_child.prev);
 +
 +      kdb_printf(" d_subdirs.nxt = 0x%p d_subdirs.prv = 0x%p\n",
 +                                      d.d_subdirs.next, d.d_subdirs.prev);
 +
 +      kdb_printf(" d_alias.nxt = 0x%p d_alias.prv = 0x%p\n",
 +                                      d.d_alias.next, d.d_alias.prev);
 +
 +      kdb_printf(" d_op = 0x%p d_sb = 0x%p d_fsdata = 0x%p\n",
 +                                      d.d_op, d.d_sb, d.d_fsdata);
 +
 +      kdb_printf(" d_iname = %s\n",
 +                                      d.d_iname);
 +
 +      if (d.d_inode) {
 +              struct inode i;
 +              kdb_printf("\nInode Entry at 0x%p\n", d.d_inode);
 +              if ((diag = kdb_getarea(i, (unsigned long)d.d_inode)))
 +                      return diag;
 +              kdb_printf(" i_mode = 0%o  i_nlink = %d  i_rdev = 0x%x\n",
 +                                              i.i_mode, i.i_nlink, i.i_rdev);
 +
 +              kdb_printf(" i_ino = %ld i_count = %d\n",
 +                                              i.i_ino, atomic_read(&i.i_count));
 +
 +              kdb_printf(" i_hash.nxt = 0x%p i_hash.prv = 0x%p\n",
 +                                              i.i_hash.next, i.i_hash.pprev);
 +
 +              kdb_printf(" i_list.nxt = 0x%p i_list.prv = 0x%p\n",
 +                                              i.i_list.next, i.i_list.prev);
 +
 +              kdb_printf(" i_dentry.nxt = 0x%p i_dentry.prv = 0x%p\n",
 +                                              i.i_dentry.next, i.i_dentry.prev);
 +
 +      }
 +      kdb_printf("\n");
 +      return 0;
 +}
 +
 +static int
 +kdbm_filp(int argc, const char **argv)
 +{
 +      struct file   f;
 +      int nextarg;
 +      unsigned long addr;
 +      long offset;
 +      int diag;
 +
 +      if (argc != 1)
 +              return KDB_ARGCOUNT;
 +
 +      nextarg = 1;
 +      if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)) ||
 +          (diag = kdb_getarea(f, addr)))
 +              return diag;
 +
 +      kdb_printf("File Pointer at 0x%lx\n", addr);
 +
 +      kdb_printf(" fu_list.nxt = 0x%p fu_list.prv = 0x%p\n",
 +                      f.f_u.fu_list.next, f.f_u.fu_list.prev);
 +
 +      kdb_printf(" f_dentry = 0x%p f_vfsmnt = 0x%p f_op = 0x%p\n",
 +                      f.f_dentry, f.f_vfsmnt, f.f_op);
 +
 +      kdb_printf(" f_count = %ld f_flags = 0x%x f_mode = 0x%x\n",
-                       f.f_count, f.f_flags, f.f_mode);
++                      atomic_long_read(&f.f_count), f.f_flags, f.f_mode);
 +
 +      kdb_printf(" f_pos = %Ld\n", f.f_pos);
 +#ifdef        CONFIG_SECURITY
 +      kdb_printf(" security = 0x%p\n", f.f_security);
 +#endif
 +
 +      kdb_printf(" private_data = 0x%p f_mapping = 0x%p\n\n",
 +                                      f.private_data, f.f_mapping);
 +
 +      return kdbm_print_dentry((unsigned long)f.f_dentry);
 +}
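
The dumpers in this file share one shape: resolve the argument with kdbgetaddrarg(), copy the structure out of kernel memory with kdb_getarea(), then print fields from the local copy. A hypothetical sketch of another command built the same way (the "sb" command name and the struct super_block fields used here are assumptions, not part of the patch; <linux/fs.h> is assumed to be available, as the dentry/filp code above implies):

static int
kdbm_sb(int argc, const char **argv)
{
        struct super_block sb;
        int nextarg;
        unsigned long addr;
        long offset;
        int diag;

        if (argc != 1)
                return KDB_ARGCOUNT;

        nextarg = 1;
        /* Resolve the argument, then copy the struct before touching it. */
        if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)) ||
            (diag = kdb_getarea(sb, addr)))
                return diag;

        kdb_printf("super_block at 0x%lx\n", addr);
        kdb_printf(" s_blocksize = %lu s_flags = 0x%lx s_magic = 0x%lx\n",
                   sb.s_blocksize, sb.s_flags, sb.s_magic);
        kdb_printf(" s_root = 0x%p\n", sb.s_root);

        return 0;
}

Such a command would be registered the same way as the commands in kdbm_vm_init() below.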
 +
 +static int
 +kdbm_fl(int argc, const char **argv)
 +{
 +      struct file_lock fl;
 +      int nextarg;
 +      unsigned long addr;
 +      long offset;
 +      int diag;
 +
 +
 +      if (argc != 1)
 +              return KDB_ARGCOUNT;
 +
 +      nextarg = 1;
 +      if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)) ||
 +              (diag = kdb_getarea(fl, addr)))
 +                      return diag;
 +
 +      kdb_printf("File_lock at 0x%lx\n", addr);
 +
 +      kdb_printf(" fl_next = 0x%p fl_link.nxt = 0x%p fl_link.prv = 0x%p\n",
 +                      fl.fl_next, fl.fl_link.next, fl.fl_link.prev);
 +      kdb_printf(" fl_block.nxt = 0x%p fl_block.prv = 0x%p\n",
 +                      fl.fl_block.next, fl.fl_block.prev);
 +      kdb_printf(" fl_owner = 0x%p fl_pid = %d fl_wait = 0x%p\n",
 +                      fl.fl_owner, fl.fl_pid, &fl.fl_wait);
 +      kdb_printf(" fl_file = 0x%p fl_flags = 0x%x\n",
 +                      fl.fl_file, fl.fl_flags);
 +      kdb_printf(" fl_type = %d fl_start = 0x%llx fl_end = 0x%llx\n",
 +                      fl.fl_type, fl.fl_start, fl.fl_end);
 +
 +      kdb_printf(" file_lock_operations");
 +      if (fl.fl_ops)
 +              kdb_printf("\n   fl_copy_lock = 0x%p fl_release_private = 0x%p\n",
 +                      fl.fl_ops->fl_copy_lock, fl.fl_ops->fl_release_private);
 +      else
 +              kdb_printf("   empty\n");
 +
 +      kdb_printf(" lock_manager_operations");
 +      if (fl.fl_lmops)
 +              kdb_printf("\n   fl_compare_owner = 0x%p fl_notify = 0x%p\n",
 +                      fl.fl_lmops->fl_compare_owner, fl.fl_lmops->fl_notify);
 +      else
 +              kdb_printf("   empty\n");
 +
 +      kdb_printf(" fl_fasync = 0x%p fl_break_time = 0x%lx\n",
 +                      fl.fl_fasync, fl.fl_break_time);
 +
 +      return 0;
 +}
 +
 +
 +static int
 +kdbm_dentry(int argc, const char **argv)
 +{
 +      int nextarg;
 +      unsigned long addr;
 +      long offset;
 +      int diag;
 +
 +      if (argc != 1)
 +              return KDB_ARGCOUNT;
 +
 +      nextarg = 1;
 +      if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)))
 +              return diag;
 +
 +      return kdbm_print_dentry(addr);
 +}
 +
 +static int
 +kdbm_kobject(int argc, const char **argv)
 +{
 +      struct kobject k;
 +      int nextarg;
 +      unsigned long addr;
 +      long offset;
 +      int diag;
 +
 +      if (argc != 1)
 +              return KDB_ARGCOUNT;
 +
 +      nextarg = 1;
 +      if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)) ||
 +          (diag = kdb_getarea(k, addr)))
 +              return diag;
 +
 +
 +      kdb_printf("kobject at 0x%lx\n", addr);
 +
 +      if (k.name) {
 +              char c;
 +              kdb_printf(" name 0x%p", k.name);
 +              if (kdb_getarea(c, (unsigned long)k.name) == 0)
 +                      kdb_printf(" '%s'", k.name);
 +              kdb_printf("\n");
 +      }
 +
 +      if (k.name != kobject_name((struct kobject *)addr))
 +              kdb_printf(" name '%.20s'\n", k.name);
 +
 +      kdb_printf(" kref.refcount = %d\n", atomic_read(&k.kref.refcount));
 +
 +      kdb_printf(" entry.next = 0x%p entry.prev = 0x%p\n",
 +                                      k.entry.next, k.entry.prev);
 +
 +      kdb_printf(" parent = 0x%p kset = 0x%p ktype = 0x%p sd = 0x%p\n",
 +                                      k.parent, k.kset, k.ktype, k.sd);
 +
 +      return 0;
 +}
 +
 +static int
 +kdbm_sh(int argc, const char **argv)
 +{
 +      int diag;
 +      int nextarg;
 +      unsigned long addr;
 +      long offset = 0L;
 +      struct Scsi_Host sh;
 +
 +      if (argc != 1)
 +              return KDB_ARGCOUNT;
 +
 +      nextarg = 1;
 +      if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)) ||
 +          (diag = kdb_getarea(sh, addr)))
 +              return diag;
 +
 +      kdb_printf("Scsi_Host at 0x%lx\n", addr);
 +      kdb_printf("host_queue = 0x%p\n", sh.__devices.next);
 +      kdb_printf("ehandler = 0x%p eh_action = 0x%p\n",
 +                 sh.ehandler, sh.eh_action);
 +      kdb_printf("host_wait = 0x%p hostt = 0x%p\n",
 +                 &sh.host_wait, sh.hostt);
 +      kdb_printf("host_failed = %d  host_no = %d resetting = %d\n",
 +                 sh.host_failed, sh.host_no, sh.resetting);
 +      kdb_printf("max id/lun/channel = [%d/%d/%d]  this_id = %d\n",
 +                 sh.max_id, sh.max_lun, sh.max_channel, sh.this_id);
 +      kdb_printf("can_queue = %d cmd_per_lun = %d  sg_tablesize = %d u_isa_dma = %d\n",
 +                 sh.can_queue, sh.cmd_per_lun, sh.sg_tablesize, sh.unchecked_isa_dma);
 +      kdb_printf("host_blocked = %d  reverse_ordering = %d \n",
 +                 sh.host_blocked, sh.reverse_ordering);
 +
 +      return 0;
 +}
 +
 +static int
 +kdbm_sd(int argc, const char **argv)
 +{
 +      int diag;
 +      int nextarg;
 +      unsigned long addr;
 +      long offset = 0L;
 +      struct scsi_device *sd = NULL;
 +
 +      if (argc != 1)
 +              return KDB_ARGCOUNT;
 +
 +      nextarg = 1;
 +      if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)))
 +              goto out;
 +      if (!(sd = kmalloc(sizeof(*sd), GFP_ATOMIC))) {
 +              kdb_printf("kdbm_sd: cannot kmalloc sd\n");
 +              goto out;
 +      }
 +      if ((diag = kdb_getarea(*sd, addr)))
 +              goto out;
 +
 +      kdb_printf("scsi_device at 0x%lx\n", addr);
 +      kdb_printf("next = 0x%p   prev = 0x%p  host = 0x%p\n",
 +                 sd->siblings.next, sd->siblings.prev, sd->host);
 +      kdb_printf("device_busy = %d   current_cmnd 0x%p\n",
 +                 sd->device_busy, sd->current_cmnd);
 +      kdb_printf("id/lun/chan = [%d/%d/%d]  single_lun = %d  device_blocked = %d\n",
 +                 sd->id, sd->lun, sd->channel, sd->sdev_target->single_lun, sd->device_blocked);
 +      kdb_printf("queue_depth = %d current_tag = %d  scsi_level = %d\n",
 +                 sd->queue_depth, sd->current_tag, sd->scsi_level);
 +      kdb_printf("%8.8s %16.16s %4.4s\n", sd->vendor, sd->model, sd->rev);
 +out:
 +      if (sd)
 +              kfree(sd);
 +      return diag;
 +}
 +
 +static int
 +kdbm_sc(int argc, const char **argv)
 +{
 +      int diag;
 +      int nextarg;
 +      unsigned long addr;
 +      long offset = 0L;
 +      struct scsi_cmnd *sc = NULL;
 +
 +      if (argc != 1)
 +              return KDB_ARGCOUNT;
 +
 +      nextarg = 1;
 +      if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)))
 +              goto out;
 +      if (!(sc = kmalloc(sizeof(*sc), GFP_ATOMIC))) {
 +              kdb_printf("kdbm_sc: cannot kmalloc sc\n");
 +              goto out;
 +      }
 +      if ((diag = kdb_getarea(*sc, addr)))
 +              goto out;
 +
 +      kdb_printf("scsi_cmnd at 0x%lx\n", addr);
 +      kdb_printf("device = 0x%p  next = 0x%p\n",
 +                 sc->device, sc->list.next);
 +      kdb_printf("serial_number = %ld  retries = %d\n",
 +                 sc->serial_number, sc->retries);
 +      kdb_printf("cmd_len = %d\n", sc->cmd_len);
 +      kdb_printf("cmnd = [%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x]\n",
 +                 sc->cmnd[0], sc->cmnd[1], sc->cmnd[2], sc->cmnd[3], sc->cmnd[4],
 +                 sc->cmnd[5], sc->cmnd[6], sc->cmnd[7], sc->cmnd[8], sc->cmnd[9],
 +                 sc->cmnd[10], sc->cmnd[11]);
 +      kdb_printf("request_buffer = 0x%p  request_bufflen = %d\n",
 +                 scsi_sglist(sc), scsi_bufflen(sc));
 +      kdb_printf("use_sg = %d\n", scsi_sg_count(sc));
 +      kdb_printf("underflow = %d transfersize = %d\n",
 +                 sc->underflow, sc->transfersize);
 +      kdb_printf("tag = %d\n", sc->tag);
 +
 +out:
 +      if (sc)
 +              kfree(sc);
 +      return diag;
 +}
 +
 +static int __init kdbm_vm_init(void)
 +{
 +      kdb_register("vm", kdbm_vm, "[-v] <vaddr>", "Display vm_area_struct", 0);
 +      kdb_register("vmp", kdbm_vm, "[-v] <pid>", "Display all vm_area_struct for <pid>", 0);
 +#ifdef CONFIG_NUMA
 +      kdb_register("mempolicy", kdbm_mpol, "<vaddr>", "Display mempolicy structure", 0);
 +      kdb_register("pgdat", kdbm_pgdat, "<node_id>", "Display pglist_data node structure", 0);
 +#else
 +      kdb_register("pgdat", kdbm_pgdat, "", "Display pglist_data node structure", 0);
 +#endif
 +      kdb_register("pte", kdbm_pte, "( -m <mm> | -p <pid> ) <vaddr> [<nbytes>]", "Display pte_t for mm_struct or pid", 0);
 +      kdb_register("rpte", kdbm_rpte, "( -m <mm> | -p <pid> ) <pfn> [<npages>]", "Find pte_t containing pfn for mm_struct or pid", 0);
 +      kdb_register("dentry", kdbm_dentry, "<dentry>", "Display interesting dentry stuff", 0);
 +      kdb_register("kobject", kdbm_kobject, "<kobject>", "Display interesting kobject stuff", 0);
 +      kdb_register("filp", kdbm_filp, "<filp>", "Display interesting filp stuff", 0);
 +      kdb_register("fl", kdbm_fl, "<fl>", "Display interesting file_lock stuff", 0);
 +      kdb_register("sh", kdbm_sh, "<vaddr>", "Show scsi_host", 0);
 +      kdb_register("sd", kdbm_sd, "<vaddr>", "Show scsi_device", 0);
 +      kdb_register("sc", kdbm_sc, "<vaddr>", "Show scsi_cmnd", 0);
 +
 +      return 0;
 +}
 +
 +static void __exit kdbm_vm_exit(void)
 +{
 +      kdb_unregister("vm");
 +      kdb_unregister("vmp");
 +#ifdef CONFIG_NUMA
 +      kdb_unregister("mempolicy");
 +#endif
 +      kdb_unregister("pgdat");
 +      kdb_unregister("pte");
 +      kdb_unregister("rpte");
 +      kdb_unregister("dentry");
 +      kdb_unregister("kobject");
 +      kdb_unregister("filp");
 +      kdb_unregister("fl");
 +      kdb_unregister("sh");
 +      kdb_unregister("sd");
 +      kdb_unregister("sc");
 +}
 +
 +module_init(kdbm_vm_init)
 +module_exit(kdbm_vm_exit)
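
The register/unregister pairing above scales down to a stand-alone module. A minimal hypothetical sketch (the "hello" command is invented for illustration and is not part of the patch):

#include <linux/module.h>
#include <linux/kdb.h>

/* Print a fixed greeting from the kdb prompt. */
static int kdbm_hello(int argc, const char **argv)
{
        if (argc != 0)
                return KDB_ARGCOUNT;
        kdb_printf("hello from kdb\n");
        return 0;
}

static int __init kdbm_hello_init(void)
{
        kdb_register("hello", kdbm_hello, "", "Print a greeting", 0);
        return 0;
}

static void __exit kdbm_hello_exit(void)
{
        kdb_unregister("hello");
}

module_init(kdbm_hello_init)
module_exit(kdbm_hello_exit)
MODULE_LICENSE("GPL");
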
Simple merge
diff --cc kernel/Makefile
Simple merge
Simple merge
diff --cc kernel/cgroup.c
Simple merge
diff --cc kernel/exit.c
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc kernel/kexec.c
  #include <asm/system.h>
  #include <asm/sections.h>
  
 +#ifdef CONFIG_KDB_KDUMP
 +#include <linux/module.h>
 +#include <linux/device.h>
 +#include <linux/kdb.h>
 +#endif
 +
- #ifndef CONFIG_XEN
  /* Per cpu memory for storing cpu states in case of system crash. */
- note_buf_t* crash_notes;
- #endif
+ note_buf_t __percpu *crash_notes;
  
  /* vmcoreinfo stuff */
  static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
diff --cc kernel/ksysfs.c
Simple merge
diff --cc kernel/module.c
Simple merge
diff --cc kernel/panic.c
@@@ -113,17 -122,10 +131,15 @@@ NORET_TYPE void panic(const char * fmt
                 * We can't use the "normal" timers since we just panicked.
                 */
                printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
 -
 +#ifdef CONFIG_BOOTSPLASH
 +              {
 +                      extern int splash_verbose(void);
 +                      (void)splash_verbose();
 +              }
 +#endif
-               for (i = 0; i < panic_timeout*1000; ) {
+               for (i = 0; i < panic_timeout; i++) {
                        touch_nmi_watchdog();
-                       i += panic_blink(i);
-                       mdelay(1);
-                       i++;
+                       panic_blink_one_second();
                }
                /*
                 * This will not be a clean reboot, with everything
        }
  #endif
        local_irq_enable();
 +#ifdef CONFIG_BOOTSPLASH
 +      {
 +              extern int splash_verbose(void);
 +              (void)splash_verbose();
 +      }
 +#endif
-       for (i = 0; ; ) {
+       while (1) {
                touch_softlockup_watchdog();
-               i += panic_blink(i);
-               mdelay(1);
-               i++;
+               panic_blink_one_second();
        }
  }
  
Simple merge
diff --cc kernel/printk.c
@@@ -35,8 -35,7 +35,9 @@@
  #include <linux/kexec.h>
  #include <linux/ratelimit.h>
  #include <linux/kmsg_dump.h>
+ #include <linux/syslog.h>
 +#include <linux/jhash.h>
 +#include <linux/device.h>
  
  #include <asm/uaccess.h>
  
@@@ -419,24 -410,9 +412,24 @@@ out
  
  SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
  {
-       return do_syslog(type, buf, len);
+       return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
  }
  
 +#ifdef CONFIG_DEBUG_KERNEL
 +/* It's very handy to be able to view the syslog buffer during debug.
 + * But do_syslog() uses locks so it cannot be used during debugging.
 + * Instead, provide the start and end of the physical and logical logs.
 + * This is equivalent to do_syslog(3).
 + */
 +void debugger_syslog_data(char *syslog_data[4])
 +{
 +      syslog_data[0] = log_buf;
 +      syslog_data[1] = log_buf + log_buf_len;
 +      syslog_data[2] = log_buf + log_end - (logged_chars < log_buf_len ? logged_chars : log_buf_len);
 +      syslog_data[3] = log_buf + log_end;
 +}
 +#endif   /* CONFIG_DEBUG_KERNEL */
 +
  /*
   * Call the console drivers on a range of log_buf
   */
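
A debugger back end has to do its own wrap-around handling with the four pointers that debugger_syslog_data() above fills in. A minimal sketch of such a consumer (the helper name is hypothetical; a power-of-two log_buf_len is assumed, as in printk.c, and kdb_printf() stands in for whatever output routine the back end provides):

/* Dump the logical log tail: [2]..[3] are logical positions, [0]..[1] the
 * physical buffer, so fold each position back with a power-of-two mask. */
static void debugger_dump_syslog_tail(void)
{
        char *syslog_data[4];
        unsigned long buf_len, len, i;

        debugger_syslog_data(syslog_data);
        buf_len = syslog_data[1] - syslog_data[0];
        len = syslog_data[3] - syslog_data[2];

        for (i = 0; i < len; i++) {
                unsigned long pos =
                        (syslog_data[2] - syslog_data[0] + i) & (buf_len - 1);
                kdb_printf("%c", syslog_data[0][pos]);
        }
}
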
diff --cc kernel/ptrace.c
Simple merge
diff --cc kernel/sched.c
Simple merge
diff --cc kernel/signal.c
Simple merge
diff --cc kernel/sys.c
Simple merge
diff --cc kernel/sysctl.c
@@@ -1279,24 -1262,6 +1278,13 @@@ static struct ctl_table vm_table[] = 
                .mode           = 0644,
                .proc_handler   = scan_unevictable_handler,
        },
 +      {
 +              .procname       = "heap-stack-gap",
 +              .data           = &heap_stack_gap,
 +              .maxlen         = sizeof(int),
 +              .mode           = 0644,
 +              .proc_handler   = proc_dointvec,
 +      },
- #ifdef CONFIG_PRESWAP
-       {
-               .procname       = "preswap",
-               .data           = NULL,
-               .maxlen         = sizeof(unsigned long),
-               .mode           = 0644,
-               .proc_handler   = preswap_sysctl_handler,
-               .extra1         = (void *)&preswap_zero,
-               .extra2         = (void *)&preswap_infinity,
-       },
- #endif
  #ifdef CONFIG_MEMORY_FAILURE
        {
                .procname       = "memory_failure_early_kill",
Simple merge
Simple merge
Simple merge
diff --cc mm/Makefile
Simple merge
diff --cc mm/filemap.c
Simple merge
diff --cc mm/hugetlb.c
Simple merge
diff --cc mm/memcontrol.c
Simple merge
diff --cc mm/memory.c
Simple merge
diff --cc mm/migrate.c
Simple merge
diff --cc mm/mmap.c
Simple merge
diff --cc mm/page_alloc.c
Simple merge
diff --cc mm/page_io.c
@@@ -99,24 -97,6 +99,17 @@@ int swap_writepage(struct page *page, s
                unlock_page(page);
                goto out;
        }
 +
 +      if (sis->flags & SWP_FILE) {
 +              struct file *swap_file = sis->swap_file;
 +              struct address_space *mapping = swap_file->f_mapping;
 +
 +              ret = mapping->a_ops->swap_out(swap_file, page, wbc);
 +              if (!ret)
 +                      count_vm_event(PSWPOUT);
 +              return ret;
 +      }
 +
-       if (preswap_put(page) == 1) {
-               set_page_writeback(page);
-               unlock_page(page);
-               end_page_writeback(page);
-               goto out;
-       }
        bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
        if (bio == NULL) {
                set_page_dirty(page);
@@@ -174,23 -121,6 +167,17 @@@ int swap_readpage(struct page *page
  
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(PageUptodate(page));
 +
 +      if (sis->flags & SWP_FILE) {
 +              struct file *swap_file = sis->swap_file;
 +              struct address_space *mapping = swap_file->f_mapping;
 +
 +              ret = mapping->a_ops->swap_in(swap_file, page);
 +              if (!ret)
 +                      count_vm_event(PSWPIN);
 +              return ret;
 +      }
 +
-       if (preswap_get(page) == 1) {
-               SetPageUptodate(page);
-               unlock_page(page);
-               goto out;
-       }
        bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
        if (bio == NULL) {
                unlock_page(page);
diff --cc mm/slab.c
Simple merge
diff --cc mm/slub.c
+++ b/mm/slub.c
@@@ -1656,10 -1628,10 +1643,10 @@@ load_freelist
        object = c->page->freelist;
        if (unlikely(!object))
                goto another_slab;
 -      if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
 -              goto debug;
 +      if (unlikely(PageSlubDebug(c->page) || c->reserve))
 +              goto slow_path;
  
-       c->freelist = object[c->offset];
+       c->freelist = get_freepointer(s, object);
        c->page->inuse = c->page->objects;
        c->page->freelist = NULL;
        c->node = page_to_nid(c->page);
@@@ -1689,9 -1660,8 +1676,9 @@@ grow_slab
                local_irq_disable();
  
        if (new) {
-               c = get_cpu_slab(s, smp_processor_id());
+               c = __this_cpu_ptr(s->cpu_slab);
 +              c->reserve = reserve;
-               stat(c, ALLOC_SLAB);
+               stat(s, ALLOC_SLAB);
                if (c->page)
                        flush_slab(s, c);
                slab_lock(new);
        if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
                slab_out_of_memory(s, gfpflags, node);
        return NULL;
 -debug:
 -      if (!alloc_debug_processing(s, c->page, object, addr))
 +
 +slow_path:
 +      if (PageSlubDebug(c->page) &&
 +                      !alloc_debug_processing(s, c->page, object, addr))
                goto another_slab;
  
 +      /*
 +       * Avoid the slub fast path in slab_alloc() by not setting
 +       * c->freelist and the fast path in slab_free() by making
 +       * node_match() fail by setting c->node to -1.
 +       *
 +       * We use this for debug and reserve checks which need
 +       * to be done for each allocation.
 +       */
 +
        c->page->inuse++;
-       c->page->freelist = object[c->offset];
+       c->page->freelist = get_freepointer(s, object);
        c->node = -1;
        goto unlock_out;
  }
diff --cc mm/swapfile.c
Simple merge
diff --cc mm/vmscan.c
Simple merge
diff --cc mm/vmstat.c
Simple merge
Simple merge
diff --cc net/core/dev.c
@@@ -2514,9 -2484,9 +2512,10 @@@ int netif_receive_skb(struct sk_buff *s
        struct packet_type *ptype, *pt_prev;
        struct net_device *orig_dev;
        struct net_device *null_or_orig;
+       struct net_device *null_or_bond;
        int ret = NET_RX_DROP;
        __be16 type;
 +      unsigned long pflags = current->flags;
  
        if (!skb->tstamp.tv64)
                net_timestamp(skb);
        }
  #endif
  
- #ifdef CONFIG_XEN
-       switch (skb->ip_summed) {
-       case CHECKSUM_UNNECESSARY:
-               skb->proto_data_valid = 1;
-               break;
-       case CHECKSUM_PARTIAL:
-               /* XXX Implement me. */
-       default:
-               skb->proto_data_valid = 0;
-               break;
-       }
- #endif
 +      if (skb_emergency(skb))
 +              goto skip_taps;
 +
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
                    ptype->dev == orig_dev) {
@@@ -2602,16 -2543,25 +2588,28 @@@ skip_taps
  ncls:
  #endif
  
 +      if (!skb_emergency_protocol(skb))
 +              goto drop;
 +
        skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
 -              goto out;
 +              goto unlock;
        skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
 -              goto out;
 +              goto unlock;
  
+       /*
+        * Make sure frames received on VLAN interfaces stacked on
+        * bonding interfaces still make their way to any base bonding
+        * device that may have registered for a specific ptype.  The
+        * handler may have to adjust skb->dev and orig_dev.
+        */
+       null_or_bond = NULL;
+       if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
+           (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
+               null_or_bond = vlan_dev_real_dev(skb->dev);
+       }
        type = skb->protocol;
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
Simple merge
Simple merge
diff --cc net/core/sock.c
Simple merge
@@@ -734,10 -729,8 +759,10 @@@ static inline void ip4_frags_ctl_regist
  }
  #endif
  
- static int ipv4_frags_init_net(struct net *net)
+ static int __net_init ipv4_frags_init_net(struct net *net)
  {
 +      int ret;
 +
        /*
         * Fragment cache limits. We will commit 256K at one time. Should we
         * cross that limit we will prune down to 192K. This should cope with
  
        inet_frags_init_net(&net->ipv4.frags);
  
 -      return ip4_frags_ns_ctl_register(net);
 +      ret = ip4_frags_ns_ctl_register(net);
 +      if (ret)
 +              goto out_reg;
 +
 +      mem_reserve_init(&net->ipv4.frags.reserve, "IPv4 fragment cache",
 +                      &net_skb_reserve);
 +      ret = mem_reserve_kmalloc_set(&net->ipv4.frags.reserve,
 +                      net->ipv4.frags.high_thresh);
 +      if (ret)
 +              goto out_reserve;
 +
 +      return 0;
 +
 +out_reserve:
 +      mem_reserve_disconnect(&net->ipv4.frags.reserve);
 +      ip4_frags_ns_ctl_unregister(net);
 +out_reg:
 +      inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
 +
 +      return ret;
  }
  
- static void ipv4_frags_exit_net(struct net *net)
+ static void __net_exit ipv4_frags_exit_net(struct net *net)
  {
 +      mem_reserve_disconnect(&net->ipv4.frags.reserve);
        ip4_frags_ns_ctl_unregister(net);
        inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
  }
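
The init/exit pair above uses the usual staged-goto unwind: each failure label undoes exactly the steps that already succeeded, and the exit path tears down in reverse order. A stand-alone sketch of the same shape (the resources are stand-ins, all names hypothetical):

#include <stdlib.h>

static void *res_a, *res_b;

/* Init side: later failures unwind only what was already set up. */
static int frags_init_sketch(void)
{
        res_a = malloc(64);             /* analogous to the ctl registration */
        if (!res_a)
                goto out;

        res_b = malloc(64);             /* analogous to the memory reserve */
        if (!res_b)
                goto out_free_a;

        return 0;

out_free_a:
        free(res_a);
        res_a = NULL;
out:
        return -1;
}

/* Exit side: reverse order of setup. */
static void frags_exit_sketch(void)
{
        free(res_b);
        free(res_a);
}

int main(void)
{
        if (frags_init_sketch())
                return 1;
        frags_exit_sketch();
        return 0;
}
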
Simple merge
diff --cc net/ipv4/tcp.c
Simple merge
Simple merge
Simple merge
@@@ -2792,10 -2804,9 +2804,10 @@@ static void addrconf_dad_start(struct i
        read_lock_bh(&idev->lock);
        if (ifp->dead)
                goto out;
-       spin_lock_bh(&ifp->lock);
  
+       spin_lock(&ifp->lock);
        if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
 +          !(dev->flags&IFF_MULTICAST) ||
            idev->cnf.accept_dad < 1 ||
            !(ifp->flags&IFA_F_TENTATIVE) ||
            ifp->flags & IFA_F_NODAD) {
@@@ -774,41 -740,19 +769,41 @@@ static inline void ip6_frags_sysctl_unr
  }
  #endif
  
- static int ipv6_frags_init_net(struct net *net)
+ static int __net_init ipv6_frags_init_net(struct net *net)
  {
 +      int ret;
 +
-       net->ipv6.frags.high_thresh = 256 * 1024;
-       net->ipv6.frags.low_thresh = 192 * 1024;
+       net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+       net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
        net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
  
        inet_frags_init_net(&net->ipv6.frags);
  
 -      return ip6_frags_ns_sysctl_register(net);
 +      ret = ip6_frags_ns_sysctl_register(net);
 +      if (ret)
 +              goto out_reg;
 +
 +      mem_reserve_init(&net->ipv6.frags.reserve, "IPv6 fragment cache",
 +                       &net_skb_reserve);
 +      ret = mem_reserve_kmalloc_set(&net->ipv6.frags.reserve,
 +                                    net->ipv6.frags.high_thresh);
 +      if (ret)
 +              goto out_reserve;
 +
 +      return 0;
 +
 +out_reserve:
 +      mem_reserve_disconnect(&net->ipv6.frags.reserve);
 +      ip6_frags_ns_sysctl_unregister(net);
 +out_reg:
 +      inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
 +
 +      return ret;
  }
  
- static void ipv6_frags_exit_net(struct net *net)
+ static void __net_exit ipv6_frags_exit_net(struct net *net)
  {
 +      mem_reserve_disconnect(&net->ipv6.frags.reserve);
        ip6_frags_ns_sysctl_unregister(net);
        inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
  }
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index a48b5f4,0000000..8881a22
mode 100644,000000..100644
--- /dev/null
@@@ -1,183 -1,0 +1,187 @@@
 +/*
 + * AppArmor security module
 + *
 + * This file contains AppArmor function for pathnames
 + *
 + * Copyright (C) 1998-2008 Novell/SUSE
 + * Copyright 2009 Canonical Ltd.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License as
 + * published by the Free Software Foundation, version 2 of the
 + * License.
 + */
 +
 +#include <linux/mnt_namespace.h>
 +#include <linux/mount.h>
 +#include <linux/namei.h>
 +#include <linux/path.h>
 +#include <linux/sched.h>
 +#include <linux/slab.h>
 +#include <linux/fs_struct.h>
 +
 +#include "include/apparmor.h"
 +#include "include/path.h"
 +
 +int aa_get_name_to_buffer(struct path *path, int is_dir, char *buffer, int size,
 +                        char **name)
 +{
 +      int error = d_namespace_path(path, buffer, size - is_dir, name);
 +
 +      if (!error && is_dir && (*name)[1] != '\0')
 +              /*
 +               * Append "/" to the pathname.  The root directory is a special
 +               * case; it already ends in slash.
 +               */
 +              strcpy(&buffer[size - 2], "/");
 +
 +      return error;
 +}
 +
 +/**
 + * aa_get_name - compute the pathname of a file
 + * @path: path to the file
 + * @is_dir: set if the file is a directory
 + * @buffer: buffer that aa_get_name() allocated
 + * @name: on return, the pathname (a pointer into @buffer), or NULL
 + *
 + * Returns an error code if there was a failure in obtaining the
 + * name.
 + *
 + * @name is a pointer to the beginning of the pathname (which usually differs
 + * from the beginning of the buffer), or NULL.  If there is an error @name
 + * may contain a partial or invalid name (in the case of a deleted file), that
 + * can be used for audit purposes, but it can not be used for mediation.
 + *
 + * We need @is_dir to indicate whether the file is a directory or not because
 + * the file may not yet exist, and so we cannot check the inode's file type.
 + */
 +int aa_get_name(struct path *path, int is_dir, char **buffer, char **name)
 +{
 +      char *buf, *str = NULL;
 +      int size = 256;
 +      int error;
 +
 +      *name = NULL;
 +      *buffer = NULL;
 +      for (;;) {
 +              buf = kmalloc(size, GFP_KERNEL);
 +              if (!buf)
 +                      return -ENOMEM;
 +
 +              error = aa_get_name_to_buffer(path, is_dir, buf, size, &str);
 +              if (!error || (error == -ENOENT) || (error == -ESTALE))
 +                      break;
 +
 +              kfree(buf);
 +              size <<= 1;
 +              if (size > g_apparmor_path_max)
 +                      return -ENAMETOOLONG;
 +      }
 +      *buffer = buf;
 +      *name = str;
 +
 +      return error;
 +}
 +
++/* Only needed until d_namespace_path is cleaned up and doesn't use
++ * vfsmount_lock anymore. -jeffm */
++extern spinlock_t vfsmount_lock;
++
 +int d_namespace_path(struct path *path, char *buf, int buflen, char **name)
 +{
 +      struct path root, tmp, ns_root = { };
 +      char *res;
 +      int deleted;
 +      int error = 0;
 +
 +      read_lock(&current->fs->lock);
 +      root = current->fs->root;
 +      path_get(&current->fs->root);
 +      read_unlock(&current->fs->lock);
 +      spin_lock(&vfsmount_lock);
 +      if (root.mnt && root.mnt->mnt_ns)
 +              ns_root.mnt = mntget(root.mnt->mnt_ns->root);
 +      if (ns_root.mnt)
 +              ns_root.dentry = dget(ns_root.mnt->mnt_root);
 +      spin_unlock(&vfsmount_lock);
 +      spin_lock(&dcache_lock);
 +
 +      do {
 +              tmp = ns_root;
 +              deleted = d_unlinked(path->dentry);
 +              res = __d_path(path, &tmp, buf, buflen);
 +      } while (deleted != d_unlinked(path->dentry));
 +
 +      *name = res;
 +      /* handle error conditions - and still allow a partial path to
 +       * be returned */
 +      if (IS_ERR(res)) {
 +              error = PTR_ERR(res);
 +              *name = buf;
 +      } else if (deleted) {
 +              /* The stripping of (deleted) is a hack that could be removed
 +               * with an updated __d_path
 +               */
 +
 +              /* Currently 2 cases fall into here.  Fixing the mediation
 +               * of deleted files for things like trunc.
 +               * And the newly allocated dentry case.  The first case
 +               * means we strip deleted for everything so the new
 +               * dentry test case is commented out below.
 +               */
 +              buf[buflen - 11] = 0;   /* - (len(" (deleted)") +\0) */
 +
 +              /* if (!path->dentry->d_inode) {
 +               * On some filesystems, newly allocated dentries appear
 +               * to the security_path hooks as a deleted
 +               * dentry except without an inode allocated.
 +               *
 +               * Remove the appended deleted text and return as a
 +               * string for normal mediation.  The (deleted) string
 +               * is guarenteed to be added in this case, so just
 +               * is guaranteed to be added in this case, so just
 +               */
 +      } else if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry)) {
 +              error = -ENOENT;
 +#if 0
 +      } else if (tmp.dentry != ns_root.dentry && tmp.mnt != ns_root.mnt) {
 +              /* disconnected paths don't return a pathname starting with '/' */
 +              error = -ESTALE;
 +              if (*res == '/')
 +                      *name = res + 1;
 +#endif
 +      }
 +
 +      spin_unlock(&dcache_lock);
 +      path_put(&root);
 +      path_put(&ns_root);
 +
 +      return error;
 +}
 +
 +char *sysctl_pathname(struct ctl_table *table, char *buffer, int buflen)
 +{
 +      if (buflen < 1)
 +              return NULL;
 +      buffer += --buflen;
 +      *buffer = '\0';
 +
 +      while (table) {
 +              int namelen = strlen(table->procname);
 +
 +              if (buflen < namelen + 1)
 +                      return NULL;
 +              buflen -= namelen + 1;
 +              buffer -= namelen;
 +              memcpy(buffer, table->procname, namelen);
 +              *--buffer = '/';
 +              table = table->parent;
 +      }
 +      if (buflen < 4)
 +              return NULL;
 +      buffer -= 4;
 +      memcpy(buffer, "/sys", 4);
 +
 +      return buffer;
 +}
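
aa_get_name() above retries with a geometrically growing buffer until d_namespace_path() fits, giving up at g_apparmor_path_max. A stand-alone user-space sketch of the same idiom (getcwd() stands in for the kernel helper, and the 4096-byte ceiling is an assumption):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Grow-and-retry: start small, double on overflow, stop at a fixed ceiling. */
static char *get_cwd_grow(void)
{
        size_t size = 256;
        char *buf;

        for (;;) {
                buf = malloc(size);
                if (!buf)
                        return NULL;
                if (getcwd(buf, size))
                        return buf;             /* fits; caller frees */
                free(buf);
                if (errno != ERANGE || (size <<= 1) > 4096)
                        return NULL;            /* hard error or path too long */
        }
}

int main(void)
{
        char *cwd = get_cwd_grow();

        if (cwd) {
                printf("%s\n", cwd);
                free(cwd);
        }
        return 0;
}
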
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge