core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/
core-$(CONFIG_IA64_SGI_UV) += arch/ia64/uv/
core-$(CONFIG_KVM) += arch/ia64/kvm/
- core-$(CONFIG_PARAVIRT_XEN) += arch/ia64/xen/
+ core-$(CONFIG_XEN) += arch/ia64/xen/
+drivers-$(CONFIG_KDB) += arch/$(ARCH)/kdb/
drivers-$(CONFIG_PCI) += arch/ia64/pci/
drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/
drivers-$(CONFIG_IA64_HP_ZX1) += arch/ia64/hp/common/ arch/ia64/hp/zx1/
boot := arch/s390/boot
- all: image kerntypes.o
-all: image bzImage
++all: image bzImage kerntypes.o
install: vmlinux
$(Q)$(MAKE) $(build)=$(boot) $@
- image kerntypes.o: vmlinux
-image bzImage: vmlinux
++image bzImage kerntypes.o: vmlinux
$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
zfcpdump:
# Makefile for the linux s390-specific parts of the memory manager.
#
-COMPILE_VERSION := __linux_compile_version_id__`hostname | \
- tr -c '[0-9A-Za-z]' '_'`__`date | \
- tr -c '[0-9A-Za-z]' '_'`_t
+COMPILE_VERSION := __linux_compile_version_id__$(shell hostname | \
+ tr -c '[0-9A-Za-z]' '_')__$(shell date | \
+ tr -c '[0-9A-Za-z]' '_')_t
+
+chk-option = $(shell if $(CC) $(CFLAGS) $(1) -S -o /dev/null -xc /dev/null \
+ > /dev/null 2>&1; then echo "$(1)"; fi ;)
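+# Example (illustrative only, not part of the original rule set):
+#   $(call chk-option,-fno-var-tracking)
+# expands to "-fno-var-tracking" if $(CC) accepts that flag, or to nothing.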
+
+# Remove possible '-g' from CFLAGS_KERNEL, since we want to use stabs
+# debug format.
+override CFLAGS_KERNEL := $(shell echo $(CFLAGS_KERNEL) | sed 's/-g//')
EXTRA_CFLAGS := -DCOMPILE_VERSION=$(COMPILE_VERSION) -gstabs -I.
+# Assume we don't need the flag if the compiler doesn't know about it
+EXTRA_CFLAGS += $(call chk-option,-fno-eliminate-unused-debug-types)
+
- targets := image kerntypes.o
+ targets := image
+ targets += bzImage
+ subdir- := compressed
++targets += kerntypes.o
$(obj)/image: vmlinux FORCE
$(call if_changed,objcopy)
+ $(obj)/bzImage: $(obj)/compressed/vmlinux FORCE
+ $(call if_changed,objcopy)
+
+ $(obj)/compressed/vmlinux: FORCE
+ $(Q)$(MAKE) $(build)=$(obj)/compressed $@
+
install: $(CONFIGURE) $(obj)/image
sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/image \
- System.map Kerntypes "$(INSTALL_PATH)"
+ System.map "$(INSTALL_PATH)"
If unsure, or if you run an older (pre 4.4) gcc, say N.
+config KDB
+ bool "Built-in Kernel Debugger support"
- depends on DEBUG_KERNEL && !XEN
++ depends on DEBUG_KERNEL
+ select KALLSYMS
+ select KALLSYMS_ALL
+ help
+ This option provides a built-in kernel debugger. The built-in
+ kernel debugger contains commands which allow memory to be examined,
+ instructions to be disassembled and breakpoints to be set. For details,
+ see Documentation/kdb/kdb.mm and the manual pages kdb_bt, kdb_ss, etc.
+ Kdb can also be used via the serial port. Set up the system to
+ have a serial console (see Documentation/serial-console.txt).
+ The key sequence <escape>KDB on the serial port will cause the
+ kernel debugger to be entered with input from the serial port and
+ output to the serial console. If unsure, say N.
+
+config KDB_MODULES
+ tristate "KDB modules"
+ depends on KDB
+ help
+ KDB can be extended by adding your own modules, in directory
+	  kdb/modules. This option selects how these modules should be
+	  compiled: as free-standing modules (select M) or built into the
+	  kernel (select Y). If unsure, say M.
+
+config KDB_OFF
+ bool "KDB off by default"
+ depends on KDB
+ help
+ Normally kdb is activated by default, as long as CONFIG_KDB is set.
+	  If you want to ship a kernel with kdb support but only have kdb
+	  turned on when the user requests it, select this option. When
+	  compiled with CONFIG_KDB_OFF, kdb ignores all events unless you boot
+	  with kdb=on or echo "1" > /proc/sys/kernel/kdb. The option also
+	  works in reverse: if kdb is normally activated, you can boot with
+	  kdb=off or echo "0" > /proc/sys/kernel/kdb to deactivate kdb. If
+	  unsure, say N.
+
+config KDB_CONTINUE_CATASTROPHIC
+ int "KDB continues after catastrophic errors"
+ depends on KDB
+ default "0"
+ help
+ This integer controls the behaviour of kdb when the kernel gets a
+ catastrophic error, i.e. for a panic, oops, NMI or other watchdog
+ tripping. CONFIG_KDB_CONTINUE_CATASTROPHIC interacts with
+ /proc/sys/kernel/kdb and CONFIG_LKCD_DUMP (if your kernel has the
+ LKCD patch).
+ When KDB is active (/proc/sys/kernel/kdb == 1) and a catastrophic
+ error occurs, nothing extra happens until you type 'go'.
+ CONFIG_KDB_CONTINUE_CATASTROPHIC == 0 (default). The first time
+ you type 'go', kdb warns you. The second time you type 'go', KDB
+ tries to continue - no guarantees that the kernel is still usable.
+ CONFIG_KDB_CONTINUE_CATASTROPHIC == 1. KDB tries to continue - no
+ guarantees that the kernel is still usable.
+ CONFIG_KDB_CONTINUE_CATASTROPHIC == 2. If your kernel has the LKCD
+ patch and LKCD is configured to take a dump then KDB forces a dump.
+ Whether or not a dump is taken, KDB forces a reboot.
+ When KDB is not active (/proc/sys/kernel/kdb == 0) and a catastrophic
+	  error occurs, the following steps are automatic; no human
+ intervention is required.
+ CONFIG_KDB_CONTINUE_CATASTROPHIC == 0 (default) or 1. KDB attempts
+ to continue - no guarantees that the kernel is still usable.
+ CONFIG_KDB_CONTINUE_CATASTROPHIC == 2. If your kernel has the LKCD
+ patch and LKCD is configured to take a dump then KDB automatically
+ forces a dump. Whether or not a dump is taken, KDB forces a
+ reboot.
+ If you are not sure, say 0. Read Documentation/kdb/dump.txt before
+ setting to 2.
+
+config KDB_USB
+ bool "Support for USB Keyboard in KDB"
+ depends on KDB && (USB_OHCI_HCD || USB_EHCI_HCD || USB_UHCI_HCD)
+ help
+ If you want to use kdb from USB keyboards then say Y here. If you
+ say N then kdb can only be used from a PC (AT) keyboard or a serial
+ console.
+
+config KDB_KDUMP
+ bool "Support for Kdump in KDB"
+ depends on KDB
+ select KEXEC
+	default n
+ help
+	  If you want to capture a kdump vmcore of the kernel from KDB, say Y here.
+ If unsure, say N.
+
endmenu
# suspend and hibernation support
drivers-$(CONFIG_PM) += arch/x86/power/
- ifeq ($(CONFIG_X86_32),y)
drivers-$(CONFIG_FB) += arch/x86/video/
- endif
+# KDB support
+drivers-$(CONFIG_KDB) += arch/x86/kdb/
+
####
# boot loader support. Several targets are kept for legacy purposes
extern void iounmap(volatile void __iomem *addr);
- #ifdef CONFIG_X86_32
- # include "io_32.h"
+ #ifdef __KERNEL__
+
+ #include <asm-generic/iomap.h>
+
+ #include <linux/vmalloc.h>
+
+ /*
+ * Convert a virtual cached pointer to an uncached pointer
+ */
+ #define xlate_dev_kmem_ptr(p) p
+
+ static inline void
+ memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
+ {
+ memset((void __force *)addr, val, count);
+ }
+
+ static inline void
+ memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
+ {
+ memcpy(dst, (const void __force *)src, count);
+ }
+
+ static inline void
+ memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
+ {
+ memcpy((void __force *)dst, src, count);
+ }
+
+ /*
+ * ISA space is 'always mapped' on a typical x86 system, no need to
+ * explicitly ioremap() it. The fact that the ISA IO space is mapped
+ * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
+ * are physical addresses. The following constant pointer can be
+ * used as the IO-area pointer (it can be iounmapped as well, so the
+ * analogy with PCI is quite large):
+ */
+ #define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
+
+ /*
+ * Cache management
+ *
+  * This is needed for two cases
+ * 1. Out of order aware processors
+ * 2. Accidentally out of order processors (PPro errata #51)
+ */
+
+ static inline void flush_write_buffers(void)
+ {
+ #if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
+ asm volatile("lock; addl $0,0(%%esp)": : :"memory");
+ #endif
+ }
+
+ #endif /* __KERNEL__ */
+
+ extern void native_io_delay(void);
+
+ extern int io_delay_type;
+ extern void io_delay_init(void);
+
-#if defined(CONFIG_PARAVIRT)
++#if defined(CONFIG_PARAVIRT_CPU)
+ #include <asm/paravirt.h>
#else
- # include "io_64.h"
+
+ static inline void slow_down_io(void)
+ {
+ native_io_delay();
+ #ifdef REALLY_SLOW_IO
+ native_io_delay();
+ native_io_delay();
+ native_io_delay();
#endif
+ }
+
+ #endif
+
+ #define BUILDIO(bwl, bw, type) \
+ static inline void out##bwl(unsigned type value, int port) \
+ { \
+ asm volatile("out" #bwl " %" #bw "0, %w1" \
+ : : "a"(value), "Nd"(port)); \
+ } \
+ \
+ static inline unsigned type in##bwl(int port) \
+ { \
+ unsigned type value; \
+ asm volatile("in" #bwl " %w1, %" #bw "0" \
+ : "=a"(value) : "Nd"(port)); \
+ return value; \
+ } \
+ \
+ static inline void out##bwl##_p(unsigned type value, int port) \
+ { \
+ out##bwl(value, port); \
+ slow_down_io(); \
+ } \
+ \
+ static inline unsigned type in##bwl##_p(int port) \
+ { \
+ unsigned type value = in##bwl(port); \
+ slow_down_io(); \
+ return value; \
+ } \
+ \
+ static inline void outs##bwl(int port, const void *addr, unsigned long count) \
+ { \
+ asm volatile("rep; outs" #bwl \
+ : "+S"(addr), "+c"(count) : "d"(port)); \
+ } \
+ \
+ static inline void ins##bwl(int port, void *addr, unsigned long count) \
+ { \
+ asm volatile("rep; ins" #bwl \
+ : "+D"(addr), "+c"(count) : "d"(port)); \
+ }
+
+ BUILDIO(b, b, char)
+ BUILDIO(w, w, short)
+ BUILDIO(l, , int)
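+
+ /*
+  * Note (illustrative, added for clarity): each BUILDIO() invocation above
+  * generates one access size of the port I/O family, e.g. BUILDIO(b, b, char)
+  * defines outb(), inb(), outb_p(), inb_p(), outsb() and insb().
+  */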
extern void *xlate_dev_mem_ptr(unsigned long phys);
extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
*/
#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
+ #define IA32_SYSCALL_VECTOR 0x80
+ #ifdef CONFIG_X86_32
+ # define SYSCALL_VECTOR 0x80
+ #endif
++#define KDBENTER_VECTOR 0x81
+
/*
* Vectors 0x30-0x3f are used for ISA interrupts.
+ * round up to the next 16-vector boundary
*/
- #define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10)
+ #define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15)
#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
#endif
#ifdef CONFIG_X86_64
- #if defined(CONFIG_PARAVIRT_MMU) || defined(CONFIG_XEN)
-#ifdef CONFIG_PARAVIRT
++#ifdef CONFIG_PARAVIRT_MMU
/* Paravirtualized systems may not have PSE or PGE available */
#define NEED_PSE 0
#define NEED_PGE 0
--- /dev/null
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2006, 2007-2009 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * Common code for doing accurate backtraces on i386 and x86_64, including
+ * printing the values of arguments.
+ */
+
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/kdb.h>
+#include <linux/kdbprivate.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/stringify.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/nmi.h>
+#include <asm/asm-offsets.h>
+#include <asm/system.h>
+
+#define KDB_DEBUG_BB(fmt, ...) \
+ {if (KDB_DEBUG(BB)) kdb_printf(fmt, ## __VA_ARGS__);}
+#define KDB_DEBUG_BB_OFFSET_PRINTF(offset, prefix, suffix) \
+	kdb_printf(prefix "%c0x%x" suffix, \
+		   (offset) >= 0 ? '+' : '-', \
+		   (offset) >= 0 ? (offset) : -(offset))
+#define KDB_DEBUG_BB_OFFSET(offset, prefix, suffix) \
+ {if (KDB_DEBUG(BB)) KDB_DEBUG_BB_OFFSET_PRINTF(offset, prefix, suffix);}
+
+#define BB_CHECK(expr, val, ret) \
+({ \
+ if (unlikely(expr)) { \
+ kdb_printf("%s, line %d: BB_CHECK(" #expr ") failed " \
+ #val "=%lx\n", \
+ __FUNCTION__, __LINE__, (long)val); \
+ bb_giveup = 1; \
+ return ret; \
+ } \
+})
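+
+/* Illustrative use (hypothetical caller, not from the original code): abandon
+ * the backtrace and return 0 from an int-returning function when an
+ * impossible register code shows up:
+ *
+ *	BB_CHECK(reg > BBRG_R15, reg, 0);
+ */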
+
+static int bb_giveup;
+
+/* Use BBRG_Rxx for both i386 and x86_64. RAX through R15 must be at the end,
+ * starting with RAX. Some of these codes do not reflect actual registers,
+ * such codes are special cases when parsing the record of register changes.
+ * When updating BBRG_ entries, update bbrg_name as well.
+ */
+
+enum bb_reg_code
+{
+ BBRG_UNDEFINED = 0, /* Register contents are undefined */
+ BBRG_OSP, /* original stack pointer on entry to function */
+ BBRG_RAX,
+ BBRG_RBX,
+ BBRG_RCX,
+ BBRG_RDX,
+ BBRG_RDI,
+ BBRG_RSI,
+ BBRG_RBP,
+ BBRG_RSP,
+ BBRG_R8,
+ BBRG_R9,
+ BBRG_R10,
+ BBRG_R11,
+ BBRG_R12,
+ BBRG_R13,
+ BBRG_R14,
+ BBRG_R15,
+};
+
+static const char *bbrg_name[] = {
+ [BBRG_UNDEFINED] = "undefined",
+ [BBRG_OSP] = "osp",
+ [BBRG_RAX] = "rax",
+ [BBRG_RBX] = "rbx",
+ [BBRG_RCX] = "rcx",
+ [BBRG_RDX] = "rdx",
+ [BBRG_RDI] = "rdi",
+ [BBRG_RSI] = "rsi",
+ [BBRG_RBP] = "rbp",
+ [BBRG_RSP] = "rsp",
+ [BBRG_R8] = "r8",
+ [BBRG_R9] = "r9",
+ [BBRG_R10] = "r10",
+ [BBRG_R11] = "r11",
+ [BBRG_R12] = "r12",
+ [BBRG_R13] = "r13",
+ [BBRG_R14] = "r14",
+ [BBRG_R15] = "r15",
+};
+
+/* Map a register name to its register code. This includes the sub-register
+ * addressable fields, e.g. parts of rax can be addressed as ax, al, ah, eax.
+ * The list is sorted so it can be binary chopped, sort command is:
+ * LANG=C sort -t '"' -k2
+ */
+
+struct bb_reg_code_map {
+ enum bb_reg_code reg;
+ const char *name;
+};
+
+static const struct bb_reg_code_map
+bb_reg_code_map[] = {
+ { BBRG_RAX, "ah" },
+ { BBRG_RAX, "al" },
+ { BBRG_RAX, "ax" },
+ { BBRG_RBX, "bh" },
+ { BBRG_RBX, "bl" },
+ { BBRG_RBP, "bp" },
+ { BBRG_RBP, "bpl" },
+ { BBRG_RBX, "bx" },
+ { BBRG_RCX, "ch" },
+ { BBRG_RCX, "cl" },
+ { BBRG_RCX, "cx" },
+ { BBRG_RDX, "dh" },
+ { BBRG_RDI, "di" },
+ { BBRG_RDI, "dil" },
+ { BBRG_RDX, "dl" },
+ { BBRG_RDX, "dx" },
+ { BBRG_RAX, "eax" },
+ { BBRG_RBP, "ebp" },
+ { BBRG_RBX, "ebx" },
+ { BBRG_RCX, "ecx" },
+ { BBRG_RDI, "edi" },
+ { BBRG_RDX, "edx" },
+ { BBRG_RSI, "esi" },
+ { BBRG_RSP, "esp" },
+ { BBRG_R10, "r10" },
+ { BBRG_R10, "r10d" },
+ { BBRG_R10, "r10l" },
+ { BBRG_R10, "r10w" },
+ { BBRG_R11, "r11" },
+ { BBRG_R11, "r11d" },
+ { BBRG_R11, "r11l" },
+ { BBRG_R11, "r11w" },
+ { BBRG_R12, "r12" },
+ { BBRG_R12, "r12d" },
+ { BBRG_R12, "r12l" },
+ { BBRG_R12, "r12w" },
+ { BBRG_R13, "r13" },
+ { BBRG_R13, "r13d" },
+ { BBRG_R13, "r13l" },
+ { BBRG_R13, "r13w" },
+ { BBRG_R14, "r14" },
+ { BBRG_R14, "r14d" },
+ { BBRG_R14, "r14l" },
+ { BBRG_R14, "r14w" },
+ { BBRG_R15, "r15" },
+ { BBRG_R15, "r15d" },
+ { BBRG_R15, "r15l" },
+ { BBRG_R15, "r15w" },
+ { BBRG_R8, "r8" },
+ { BBRG_R8, "r8d" },
+ { BBRG_R8, "r8l" },
+ { BBRG_R8, "r8w" },
+ { BBRG_R9, "r9" },
+ { BBRG_R9, "r9d" },
+ { BBRG_R9, "r9l" },
+ { BBRG_R9, "r9w" },
+ { BBRG_RAX, "rax" },
+ { BBRG_RBP, "rbp" },
+ { BBRG_RBX, "rbx" },
+ { BBRG_RCX, "rcx" },
+ { BBRG_RDI, "rdi" },
+ { BBRG_RDX, "rdx" },
+ { BBRG_RSI, "rsi" },
+ { BBRG_RSP, "rsp" },
+ { BBRG_RSI, "si" },
+ { BBRG_RSI, "sil" },
+ { BBRG_RSP, "sp" },
+ { BBRG_RSP, "spl" },
+};
+
+/* Record register contents in terms of the values that were passed to this
+ * function, IOW track which registers contain an input value. A register's
+ * contents can be undefined, it can contain an input register value or it can
+ * contain an offset from the original stack pointer.
+ *
+ * This structure is used to represent the current contents of the integer
+ * registers, it is held in an array that is indexed by BBRG_xxx. The element
+ * for BBRG_xxx indicates what input value is currently in BBRG_xxx. When
+ * 'value' is BBRG_OSP then register BBRG_xxx contains a stack pointer,
+ * pointing at 'offset' from the original stack pointer on entry to the
+ * function. When 'value' is not BBRG_OSP then element BBRG_xxx contains the
+ * original contents of an input register and offset is ignored.
+ *
+ * An input register 'value' can be stored in more than one register and/or in
+ * more than one memory location.
+ */
+
+struct bb_reg_contains
+{
+ enum bb_reg_code value: 8;
+ short offset;
+};
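+
+/* Illustrative example (not from the original code): after "mov %rsp,%rbp"
+ * near the top of a function, the tracking entry for RBP would be
+ * { .value = BBRG_OSP, .offset = <RBP's current offset from OSP> }, while
+ * after "mov %rdi,%rbx" the entry for RBX would be
+ * { .value = BBRG_RDI, .offset = 0 } (offset is ignored for non-OSP values).
+ */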
+
+/* Note: the offsets in struct bb_mem_contains in this code are _NOT_ offsets
+ * from OSP, they are offsets from current RSP. It fits better with the way
+ * that struct pt_regs is built, some code pushes extra data before pt_regs so
+ * working with OSP relative offsets gets messy. struct bb_mem_contains
+ * entries must be in descending order of RSP offset.
+ */
+
+typedef struct { DECLARE_BITMAP(bits, BBRG_R15+1); } bbrgmask_t;
+#define BB_SKIP(reg) (1 << (BBRG_ ## reg))
+struct bb_mem_contains {
+ short offset_address;
+ enum bb_reg_code value: 8;
+};
+
+/* Transfer of control to a label outside the current function. If the
+ * transfer is to a known common restore path that expects known registers
+ * and/or a known memory state (e.g. struct pt_regs) then do a sanity check on
+ * the state at this point.
+ */
+
+struct bb_name_state {
+ const char *name; /* target function */
+ bfd_vma address; /* Address of target function */
+ const char *fname; /* optional from function name */
+ const struct bb_mem_contains *mem; /* expected memory state */
+ const struct bb_reg_contains *regs; /* expected register state */
+ const unsigned short mem_size; /* ARRAY_SIZE(mem) */
+ const unsigned short regs_size; /* ARRAY_SIZE(regs) */
+ const short osp_offset; /* RSP in regs == OSP+osp_offset */
+ const bbrgmask_t skip_mem; /* Some slots in mem may be undefined */
+ const bbrgmask_t skip_regs; /* Some slots in regs may be undefined */
+};
+
+/* NS (NAME_STATE) macros define the register and memory state when we transfer
+ * control to or start decoding a special case name. Use NS when the target
+ * label always has the same state. Use NS_FROM and specify the source label
+ * if the target state is slightly different depending on where it is branched
+ * from. This gives better state checking, by isolating the special cases.
+ *
+ * Note: for the same target label, NS_FROM entries must be followed by a
+ * single NS entry.
+ */
+
+#define NS_FROM(iname, ifname, imem, iregs, iskip_mem, iskip_regs, iosp_offset) \
+ { \
+ .name = iname, \
+ .fname = ifname, \
+ .mem = imem, \
+ .regs = iregs, \
+ .mem_size = ARRAY_SIZE(imem), \
+ .regs_size = ARRAY_SIZE(iregs), \
+ .skip_mem.bits[0] = iskip_mem, \
+ .skip_regs.bits[0] = iskip_regs, \
+ .osp_offset = iosp_offset, \
+ .address = 0 \
+ }
+
+/* Shorter forms for the common cases */
+#define NS(iname, imem, iregs, iskip_mem, iskip_regs, iosp_offset) \
+ NS_FROM(iname, NULL, imem, iregs, iskip_mem, iskip_regs, iosp_offset)
+#define NS_MEM(iname, imem, iskip_mem) \
+ NS_FROM(iname, NULL, imem, no_regs, iskip_mem, 0, 0)
+#define NS_MEM_FROM(iname, ifname, imem, iskip_mem) \
+ NS_FROM(iname, ifname, imem, no_regs, iskip_mem, 0, 0)
+#define NS_REG(iname, iregs, iskip_regs) \
+ NS_FROM(iname, NULL, no_memory, iregs, 0, iskip_regs, 0)
+#define NS_REG_FROM(iname, ifname, iregs, iskip_regs) \
+ NS_FROM(iname, ifname, no_memory, iregs, 0, iskip_regs, 0)
+
+static void
+bb_reg_code_set_value(enum bb_reg_code dst, enum bb_reg_code src);
+
+static const char *bb_mod_name, *bb_func_name;
+
+static int
+bb_noret(const char *name)
+{
+ if (strcmp(name, "panic") == 0 ||
+ strcmp(name, "do_exit") == 0 ||
+ strcmp(name, "do_group_exit") == 0 ||
+ strcmp(name, "complete_and_exit") == 0)
+ return 1;
+ return 0;
+}
+
+/*============================================================================*/
+/* */
+/* Most of the basic block code and data is common to x86_64 and i386. This */
+/* large ifdef contains almost all of the differences between the two */
+/* architectures. */
+/* */
+/* Make sure you update the correct section of this ifdef. */
+/* */
+/*============================================================================*/
+
+#ifdef CONFIG_X86_64
+
+/* Registers that can be used to pass parameters, in the order that parameters
+ * are passed.
+ */
+
+static const enum bb_reg_code
+bb_param_reg[] = {
+ BBRG_RDI,
+ BBRG_RSI,
+ BBRG_RDX,
+ BBRG_RCX,
+ BBRG_R8,
+ BBRG_R9,
+};
+
+static const enum bb_reg_code
+bb_preserved_reg[] = {
+ BBRG_RBX,
+ BBRG_RBP,
+ BBRG_RSP,
+ BBRG_R12,
+ BBRG_R13,
+ BBRG_R14,
+ BBRG_R15,
+};
+
+static const struct bb_mem_contains full_pt_regs[] = {
+ { 0x70, BBRG_RDI },
+ { 0x68, BBRG_RSI },
+ { 0x60, BBRG_RDX },
+ { 0x58, BBRG_RCX },
+ { 0x50, BBRG_RAX },
+ { 0x48, BBRG_R8 },
+ { 0x40, BBRG_R9 },
+ { 0x38, BBRG_R10 },
+ { 0x30, BBRG_R11 },
+ { 0x28, BBRG_RBX },
+ { 0x20, BBRG_RBP },
+ { 0x18, BBRG_R12 },
+ { 0x10, BBRG_R13 },
+ { 0x08, BBRG_R14 },
+ { 0x00, BBRG_R15 },
+};
+static const struct bb_mem_contains full_pt_regs_plus_1[] = {
+ { 0x78, BBRG_RDI },
+ { 0x70, BBRG_RSI },
+ { 0x68, BBRG_RDX },
+ { 0x60, BBRG_RCX },
+ { 0x58, BBRG_RAX },
+ { 0x50, BBRG_R8 },
+ { 0x48, BBRG_R9 },
+ { 0x40, BBRG_R10 },
+ { 0x38, BBRG_R11 },
+ { 0x30, BBRG_RBX },
+ { 0x28, BBRG_RBP },
+ { 0x20, BBRG_R12 },
+ { 0x18, BBRG_R13 },
+ { 0x10, BBRG_R14 },
+ { 0x08, BBRG_R15 },
+};
+/*
+ * Going into error_exit we have the hardware pushed error_code on the stack
+ * plus a full pt_regs
+ */
+static const struct bb_mem_contains error_code_full_pt_regs[] = {
+ { 0x78, BBRG_UNDEFINED },
+ { 0x70, BBRG_RDI },
+ { 0x68, BBRG_RSI },
+ { 0x60, BBRG_RDX },
+ { 0x58, BBRG_RCX },
+ { 0x50, BBRG_RAX },
+ { 0x48, BBRG_R8 },
+ { 0x40, BBRG_R9 },
+ { 0x38, BBRG_R10 },
+ { 0x30, BBRG_R11 },
+ { 0x28, BBRG_RBX },
+ { 0x20, BBRG_RBP },
+ { 0x18, BBRG_R12 },
+ { 0x10, BBRG_R13 },
+ { 0x08, BBRG_R14 },
+ { 0x00, BBRG_R15 },
+};
+static const struct bb_mem_contains partial_pt_regs[] = {
+ { 0x40, BBRG_RDI },
+ { 0x38, BBRG_RSI },
+ { 0x30, BBRG_RDX },
+ { 0x28, BBRG_RCX },
+ { 0x20, BBRG_RAX },
+ { 0x18, BBRG_R8 },
+ { 0x10, BBRG_R9 },
+ { 0x08, BBRG_R10 },
+ { 0x00, BBRG_R11 },
+};
+static const struct bb_mem_contains partial_pt_regs_plus_1[] = {
+ { 0x48, BBRG_RDI },
+ { 0x40, BBRG_RSI },
+ { 0x38, BBRG_RDX },
+ { 0x30, BBRG_RCX },
+ { 0x28, BBRG_RAX },
+ { 0x20, BBRG_R8 },
+ { 0x18, BBRG_R9 },
+ { 0x10, BBRG_R10 },
+ { 0x08, BBRG_R11 },
+};
+static const struct bb_mem_contains partial_pt_regs_plus_2[] = {
+ { 0x50, BBRG_RDI },
+ { 0x48, BBRG_RSI },
+ { 0x40, BBRG_RDX },
+ { 0x38, BBRG_RCX },
+ { 0x30, BBRG_RAX },
+ { 0x28, BBRG_R8 },
+ { 0x20, BBRG_R9 },
+ { 0x18, BBRG_R10 },
+ { 0x10, BBRG_R11 },
+};
+static const struct bb_mem_contains no_memory[] = {
+};
+/* Hardware has already pushed an error_code on the stack. Use undefined just
+ * to set the initial stack offset.
+ */
+static const struct bb_mem_contains error_code[] = {
+ { 0x0, BBRG_UNDEFINED },
+};
+/* error_code plus original rax */
+static const struct bb_mem_contains error_code_rax[] = {
+ { 0x8, BBRG_UNDEFINED },
+ { 0x0, BBRG_RAX },
+};
+
+static const struct bb_reg_contains all_regs[] = {
+ [BBRG_RAX] = { BBRG_RAX, 0 },
+ [BBRG_RBX] = { BBRG_RBX, 0 },
+ [BBRG_RCX] = { BBRG_RCX, 0 },
+ [BBRG_RDX] = { BBRG_RDX, 0 },
+ [BBRG_RDI] = { BBRG_RDI, 0 },
+ [BBRG_RSI] = { BBRG_RSI, 0 },
+ [BBRG_RBP] = { BBRG_RBP, 0 },
+ [BBRG_RSP] = { BBRG_OSP, 0 },
+ [BBRG_R8 ] = { BBRG_R8, 0 },
+ [BBRG_R9 ] = { BBRG_R9, 0 },
+ [BBRG_R10] = { BBRG_R10, 0 },
+ [BBRG_R11] = { BBRG_R11, 0 },
+ [BBRG_R12] = { BBRG_R12, 0 },
+ [BBRG_R13] = { BBRG_R13, 0 },
+ [BBRG_R14] = { BBRG_R14, 0 },
+ [BBRG_R15] = { BBRG_R15, 0 },
+};
+static const struct bb_reg_contains no_regs[] = {
+};
+
+static struct bb_name_state bb_special_cases[] = {
+
+ /* First the cases that pass data only in memory. We do not check any
+ * register state for these cases.
+ */
+
+ /* Simple cases, no exceptions */
+ NS_MEM("ia32_ptregs_common", partial_pt_regs_plus_1, 0),
+ NS_MEM("ia32_sysret", partial_pt_regs, 0),
+ NS_MEM("int_careful", partial_pt_regs, 0),
+ NS_MEM("ia32_badarg", partial_pt_regs, 0),
+ NS_MEM("int_restore_rest", full_pt_regs, 0),
+ NS_MEM("int_signal", full_pt_regs, 0),
+ NS_MEM("int_very_careful", partial_pt_regs, 0),
+ NS_MEM("ptregscall_common", full_pt_regs_plus_1, 0),
+ NS_MEM("ret_from_intr", partial_pt_regs_plus_2, 0),
+ NS_MEM("stub32_clone", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub32_execve", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub32_fork", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub32_iopl", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub32_rt_sigreturn", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub32_sigaltstack", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub32_sigreturn", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub32_vfork", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub_clone", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub_execve", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub_fork", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub_iopl", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub_rt_sigreturn", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub_sigaltstack", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub_vfork", partial_pt_regs_plus_1, 0),
+ NS_MEM("sysenter_auditsys", partial_pt_regs,
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11)),
+
+ NS_MEM("paranoid_exit", error_code_full_pt_regs, 0),
+
+ NS_MEM_FROM("ia32_badsys", "ia32_sysenter_target",
+ partial_pt_regs,
+ /* ia32_sysenter_target uses CLEAR_RREGS to clear R8-R11 on
+ * some paths. It also stomps on RAX.
+ */
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
+ BB_SKIP(RAX)),
+ NS_MEM_FROM("ia32_badsys", "ia32_cstar_target",
+ partial_pt_regs,
+ /* ia32_cstar_target uses CLEAR_RREGS to clear R8-R11 on some
+ * paths. It also stomps on RAX. Even more confusing, instead
+ * of storing RCX it stores RBP. WTF?
+ */
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+ NS_MEM_FROM("ia32_badsys", "ia32_syscall",
+ partial_pt_regs,
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11)),
+ NS_MEM("ia32_badsys", partial_pt_regs, 0),
+
+#ifdef CONFIG_AUDITSYSCALL
+ NS_MEM_FROM("int_with_check", "sysexit_audit", partial_pt_regs,
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
+ BB_SKIP(RAX)),
+ NS_MEM_FROM("int_with_check", "ia32_cstar_target", partial_pt_regs,
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+#endif
+ NS_MEM("int_with_check", no_memory, 0),
+
+ /* Various bits of code branch to int_ret_from_sys_call, with slightly
+ * different missing values in pt_regs.
+ */
+ NS_MEM_FROM("int_ret_from_sys_call", "ret_from_fork",
+ partial_pt_regs,
+ BB_SKIP(R11)),
+ NS_MEM_FROM("int_ret_from_sys_call", "stub_execve",
+ partial_pt_regs,
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+ NS_MEM_FROM("int_ret_from_sys_call", "stub_rt_sigreturn",
+ partial_pt_regs,
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+ NS_MEM_FROM("int_ret_from_sys_call", "kernel_execve",
+ partial_pt_regs,
+ BB_SKIP(RAX)),
+ NS_MEM_FROM("int_ret_from_sys_call", "ia32_syscall",
+ partial_pt_regs,
+ /* ia32_syscall only saves RDI through RCX. */
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
+ BB_SKIP(RAX)),
+ NS_MEM_FROM("int_ret_from_sys_call", "ia32_sysenter_target",
+ partial_pt_regs,
+ /* ia32_sysenter_target uses CLEAR_RREGS to clear R8-R11 on
+ * some paths. It also stomps on RAX.
+ */
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
+ BB_SKIP(RAX)),
+ NS_MEM_FROM("int_ret_from_sys_call", "ia32_cstar_target",
+ partial_pt_regs,
+ /* ia32_cstar_target uses CLEAR_RREGS to clear R8-R11 on some
+ * paths. It also stomps on RAX. Even more confusing, instead
+ * of storing RCX it stores RBP. WTF?
+ */
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+ NS_MEM_FROM("int_ret_from_sys_call", "ia32_badsys",
+ partial_pt_regs, BB_SKIP(RAX)),
+ NS_MEM("int_ret_from_sys_call", partial_pt_regs, 0),
+
+#ifdef CONFIG_PREEMPT
+ NS_MEM("retint_kernel", partial_pt_regs, BB_SKIP(RAX)),
+#endif /* CONFIG_PREEMPT */
+
+ NS_MEM("retint_careful", partial_pt_regs, BB_SKIP(RAX)),
+
+ /* Horrible hack: For a brand new x86_64 task, switch_to() branches to
+ * ret_from_fork with a totally different stack state from all the
+ * other tasks that come out of switch_to(). This non-standard state
+ * cannot be represented so just ignore the branch from switch_to() to
+ * ret_from_fork. Due to inlining and linker labels, switch_to() can
+ * appear as several different function labels, including schedule,
+ * context_switch and __sched_text_start.
+ */
+ NS_MEM_FROM("ret_from_fork", "schedule", no_memory, 0),
+ NS_MEM_FROM("ret_from_fork", "__schedule", no_memory, 0),
+ NS_MEM_FROM("ret_from_fork", "__sched_text_start", no_memory, 0),
+ NS_MEM_FROM("ret_from_fork", "context_switch", no_memory, 0),
+ NS_MEM("ret_from_fork", full_pt_regs, 0),
+
+ NS_MEM_FROM("ret_from_sys_call", "ret_from_fork",
+ partial_pt_regs,
+ BB_SKIP(R11)),
+ NS_MEM("ret_from_sys_call", partial_pt_regs, 0),
+
+ NS_MEM("retint_restore_args",
+ partial_pt_regs,
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+
+ NS_MEM("retint_swapgs",
+ partial_pt_regs,
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+
+ /* Now the cases that pass data in registers. We do not check any
+ * memory state for these cases.
+ */
+
+ NS_REG("bad_put_user",
+ all_regs, BB_SKIP(RBX)),
+
+ NS_REG("bad_get_user",
+ all_regs, BB_SKIP(RAX) | BB_SKIP(RDX)),
+
+ NS_REG("bad_to_user",
+ all_regs,
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+
+ NS_REG("ia32_ptregs_common",
+ all_regs,
+ 0),
+
+ NS_REG("copy_user_generic_unrolled",
+ all_regs,
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+
+ NS_REG("copy_user_generic_string",
+ all_regs,
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+
+ NS_REG("irq_return",
+ all_regs,
+ 0),
+
+ /* Finally the cases that pass data in both registers and memory.
+ */
+
+ NS("invalid_TSS", error_code, all_regs, 0, 0, 0),
+ NS("segment_not_present", error_code, all_regs, 0, 0, 0),
+ NS("alignment_check", error_code, all_regs, 0, 0, 0),
+ NS("page_fault", error_code, all_regs, 0, 0, 0),
+ NS("general_protection", error_code, all_regs, 0, 0, 0),
+ NS("error_entry", error_code_rax, all_regs, 0, BB_SKIP(RAX), -0x10),
+ NS("error_exit", error_code_full_pt_regs, no_regs, 0, 0, 0x30),
+ NS("common_interrupt", error_code, all_regs, 0, 0, -0x8),
+ NS("save_args", error_code, all_regs, 0, 0, -0x50),
+ NS("int3", no_memory, all_regs, 0, 0, -0x80),
+};
+
+static const char *bb_spurious[] = {
+ /* schedule */
+ "thread_return",
+ /* system_call */
+ "system_call_after_swapgs",
+ "system_call_fastpath",
+ "ret_from_sys_call",
+ "sysret_check",
+ "sysret_careful",
+ "sysret_signal",
+ "badsys",
+#ifdef CONFIG_AUDITSYSCALL
+ "auditsys",
+ "sysret_audit",
+#endif
+ "tracesys",
+ "int_ret_from_sys_call",
+ "int_with_check",
+ "int_careful",
+ "int_very_careful",
+ "int_signal",
+ "int_restore_rest",
+ /* common_interrupt */
+ "ret_from_intr",
+ "exit_intr",
+ "retint_with_reschedule",
+ "retint_check",
+ "retint_swapgs",
+ "retint_restore_args",
+ "restore_args",
+ "irq_return",
+ "bad_iret",
+ "retint_careful",
+ "retint_signal",
+#ifdef CONFIG_PREEMPT
+ "retint_kernel",
+#endif /* CONFIG_PREEMPT */
+ /* paranoid_exit */
+ "paranoid_swapgs",
+ "paranoid_restore",
+ "paranoid_userspace",
+ "paranoid_schedule",
+ /* error_entry */
+ "error_swapgs",
+ "error_sti",
+ "error_kernelspace",
+ /* nmi */
+#ifdef CONFIG_TRACE_IRQFLAGS
+ "nmi_swapgs",
+ "nmi_restore",
+ "nmi_userspace",
+ "nmi_schedule",
+#endif
+ /* load_gs_index */
+ "gs_change",
+ "bad_gs",
+ /* ia32_sysenter_target */
+ "sysenter_do_call",
+ "sysenter_dispatch",
+ "sysexit_from_sys_call",
+#ifdef CONFIG_AUDITSYSCALL
+ "sysenter_auditsys",
+ "sysexit_audit",
+#endif
+ "sysenter_tracesys",
+ /* ia32_cstar_target */
+ "cstar_do_call",
+ "cstar_dispatch",
+ "sysretl_from_sys_call",
+#ifdef CONFIG_AUDITSYSCALL
+ "cstar_auditsys",
+ "sysretl_audit",
+#endif
+ "cstar_tracesys",
+ /* ia32_syscall */
+ "ia32_do_call",
+ "ia32_sysret",
+ "ia32_tracesys",
+#ifdef CONFIG_HIBERNATION
+ /* restore_image */
+ "loop",
+ "done",
+#endif /* CONFIG_HIBERNATION */
+#ifdef CONFIG_KPROBES
+ /* jprobe_return */
+ "jprobe_return_end",
+ /* kretprobe_trampoline_holder */
+ "kretprobe_trampoline",
+#endif /* CONFIG_KPROBES */
+#ifdef CONFIG_KEXEC
+ /* relocate_kernel */
+ "relocate_new_kernel",
+#endif /* CONFIG_KEXEC */
- #ifdef CONFIG_PARAVIRT_XEN
++#ifdef CONFIG_XEN
+ /* arch/i386/xen/xen-asm.S */
+ "xen_irq_enable_direct_end",
+ "xen_irq_disable_direct_end",
+ "xen_save_fl_direct_end",
+ "xen_restore_fl_direct_end",
+ "xen_iret_start_crit",
+ "iret_restore_end",
+ "xen_iret_end_crit",
+ "hyper_iret",
+#endif /* CONFIG_XEN */
+};
+
+static const char *bb_hardware_handlers[] = {
+ "system_call",
+ "common_interrupt",
+ "error_entry",
+ "debug",
+ "nmi",
+ "int3",
+ "double_fault",
+ "stack_segment",
+ "machine_check",
+ "kdb_call",
+};
+
+static int
+bb_hardware_pushed_arch(kdb_machreg_t rsp,
+ const struct kdb_activation_record *ar)
+{
+ /* x86_64 interrupt stacks are 16 byte aligned and you must get the
+ * next rsp from stack, it cannot be statically calculated. Do not
+ * include the word at rsp, it is pushed by hardware but is treated as
+ * a normal software return value.
+ *
+ * When an IST switch occurs (e.g. NMI) then the saved rsp points to
+ * another stack entirely. Assume that the IST stack is 16 byte
+ * aligned and just return the size of the hardware data on this stack.
+ * The stack unwind code will take care of the stack switch.
+ */
+ kdb_machreg_t saved_rsp = *((kdb_machreg_t *)rsp + 3);
+ int hardware_pushed = saved_rsp - rsp - KDB_WORD_SIZE;
+ if (hardware_pushed < 4 * KDB_WORD_SIZE ||
+ saved_rsp < ar->stack.logical_start ||
+ saved_rsp >= ar->stack.logical_end)
+ return 4 * KDB_WORD_SIZE;
+ else
+ return hardware_pushed;
+}
+
+static void
+bb_start_block0(void)
+{
+ bb_reg_code_set_value(BBRG_RAX, BBRG_RAX);
+ bb_reg_code_set_value(BBRG_RBX, BBRG_RBX);
+ bb_reg_code_set_value(BBRG_RCX, BBRG_RCX);
+ bb_reg_code_set_value(BBRG_RDX, BBRG_RDX);
+ bb_reg_code_set_value(BBRG_RDI, BBRG_RDI);
+ bb_reg_code_set_value(BBRG_RSI, BBRG_RSI);
+ bb_reg_code_set_value(BBRG_RBP, BBRG_RBP);
+ bb_reg_code_set_value(BBRG_RSP, BBRG_OSP);
+ bb_reg_code_set_value(BBRG_R8, BBRG_R8);
+ bb_reg_code_set_value(BBRG_R9, BBRG_R9);
+ bb_reg_code_set_value(BBRG_R10, BBRG_R10);
+ bb_reg_code_set_value(BBRG_R11, BBRG_R11);
+ bb_reg_code_set_value(BBRG_R12, BBRG_R12);
+ bb_reg_code_set_value(BBRG_R13, BBRG_R13);
+ bb_reg_code_set_value(BBRG_R14, BBRG_R14);
+ bb_reg_code_set_value(BBRG_R15, BBRG_R15);
+}
+
+/* x86_64 does not have a special case for __switch_to */
+
+static void
+bb_fixup_switch_to(char *p)
+{
+}
+
+static int
+bb_asmlinkage_arch(void)
+{
+ return strncmp(bb_func_name, "__down", 6) == 0 ||
+ strncmp(bb_func_name, "__up", 4) == 0 ||
+ strncmp(bb_func_name, "stub_", 5) == 0 ||
+ strcmp(bb_func_name, "ret_from_fork") == 0 ||
+ strcmp(bb_func_name, "ptregscall_common") == 0;
+}
+
+#else /* !CONFIG_X86_64 */
+
+/* Registers that can be used to pass parameters, in the order that parameters
+ * are passed.
+ */
+
+static const enum bb_reg_code
+bb_param_reg[] = {
+ BBRG_RAX,
+ BBRG_RDX,
+ BBRG_RCX,
+};
+
+static const enum bb_reg_code
+bb_preserved_reg[] = {
+ BBRG_RBX,
+ BBRG_RBP,
+ BBRG_RSP,
+ BBRG_RSI,
+ BBRG_RDI,
+};
+
+static const struct bb_mem_contains full_pt_regs[] = {
+ { 0x18, BBRG_RAX },
+ { 0x14, BBRG_RBP },
+ { 0x10, BBRG_RDI },
+ { 0x0c, BBRG_RSI },
+ { 0x08, BBRG_RDX },
+ { 0x04, BBRG_RCX },
+ { 0x00, BBRG_RBX },
+};
+static const struct bb_mem_contains no_memory[] = {
+};
+/* Hardware has already pushed an error_code on the stack. Use undefined just
+ * to set the initial stack offset.
+ */
+static const struct bb_mem_contains error_code[] = {
+ { 0x0, BBRG_UNDEFINED },
+};
+/* rbx already pushed */
+static const struct bb_mem_contains rbx_pushed[] = {
+ { 0x0, BBRG_RBX },
+};
+#ifdef CONFIG_MATH_EMULATION
+static const struct bb_mem_contains mem_fpu_reg_round[] = {
+ { 0xc, BBRG_RBP },
+ { 0x8, BBRG_RSI },
+ { 0x4, BBRG_RDI },
+ { 0x0, BBRG_RBX },
+};
+#endif /* CONFIG_MATH_EMULATION */
+
+static const struct bb_reg_contains all_regs[] = {
+ [BBRG_RAX] = { BBRG_RAX, 0 },
+ [BBRG_RBX] = { BBRG_RBX, 0 },
+ [BBRG_RCX] = { BBRG_RCX, 0 },
+ [BBRG_RDX] = { BBRG_RDX, 0 },
+ [BBRG_RDI] = { BBRG_RDI, 0 },
+ [BBRG_RSI] = { BBRG_RSI, 0 },
+ [BBRG_RBP] = { BBRG_RBP, 0 },
+ [BBRG_RSP] = { BBRG_OSP, 0 },
+};
+static const struct bb_reg_contains no_regs[] = {
+};
+#ifdef CONFIG_MATH_EMULATION
+static const struct bb_reg_contains reg_fpu_reg_round[] = {
+ [BBRG_RBP] = { BBRG_OSP, -0x4 },
+ [BBRG_RSP] = { BBRG_OSP, -0x10 },
+};
+#endif /* CONFIG_MATH_EMULATION */
+
+static struct bb_name_state bb_special_cases[] = {
+
+ /* First the cases that pass data only in memory. We do not check any
+ * register state for these cases.
+ */
+
+ /* Simple cases, no exceptions */
+ NS_MEM("check_userspace", full_pt_regs, 0),
+ NS_MEM("device_not_available_emulate", full_pt_regs, 0),
+ NS_MEM("ldt_ss", full_pt_regs, 0),
+ NS_MEM("no_singlestep", full_pt_regs, 0),
+ NS_MEM("restore_all", full_pt_regs, 0),
+ NS_MEM("restore_nocheck", full_pt_regs, 0),
+ NS_MEM("restore_nocheck_notrace", full_pt_regs, 0),
+ NS_MEM("ret_from_exception", full_pt_regs, 0),
+ NS_MEM("ret_from_fork", full_pt_regs, 0),
+ NS_MEM("ret_from_intr", full_pt_regs, 0),
+ NS_MEM("work_notifysig", full_pt_regs, 0),
+ NS_MEM("work_pending", full_pt_regs, 0),
+
+#ifdef CONFIG_PREEMPT
+ NS_MEM("resume_kernel", full_pt_regs, 0),
+#endif /* CONFIG_PREEMPT */
+
+ NS_MEM("common_interrupt", error_code, 0),
+ NS_MEM("error_code", error_code, 0),
+
+ NS_MEM("bad_put_user", rbx_pushed, 0),
+
+ NS_MEM_FROM("resume_userspace", "syscall_badsys",
+ full_pt_regs, BB_SKIP(RAX)),
+ NS_MEM_FROM("resume_userspace", "syscall_fault",
+ full_pt_regs, BB_SKIP(RAX)),
+ NS_MEM_FROM("resume_userspace", "syscall_trace_entry",
+ full_pt_regs, BB_SKIP(RAX)),
+ /* Too difficult to trace through the various vm86 functions for now.
+ * They are C functions that start off with some memory state, fiddle
+ * the registers then jmp directly to resume_userspace. For the
+ * moment, just assume that they are valid and do no checks.
+ */
+ NS_FROM("resume_userspace", "do_int",
+ no_memory, no_regs, 0, 0, 0),
+ NS_FROM("resume_userspace", "do_sys_vm86",
+ no_memory, no_regs, 0, 0, 0),
+ NS_FROM("resume_userspace", "handle_vm86_fault",
+ no_memory, no_regs, 0, 0, 0),
+ NS_FROM("resume_userspace", "handle_vm86_trap",
+ no_memory, no_regs, 0, 0, 0),
+ NS_MEM("resume_userspace", full_pt_regs, 0),
+
+ NS_MEM_FROM("syscall_badsys", "ia32_sysenter_target",
+ full_pt_regs, BB_SKIP(RBP)),
+ NS_MEM("syscall_badsys", full_pt_regs, 0),
+
+ NS_MEM_FROM("syscall_call", "syscall_trace_entry",
+ full_pt_regs, BB_SKIP(RAX)),
+ NS_MEM("syscall_call", full_pt_regs, 0),
+
+ NS_MEM_FROM("syscall_exit", "syscall_trace_entry",
+ full_pt_regs, BB_SKIP(RAX)),
+ NS_MEM("syscall_exit", full_pt_regs, 0),
+
+ NS_MEM_FROM("syscall_exit_work", "ia32_sysenter_target",
+ full_pt_regs, BB_SKIP(RAX) | BB_SKIP(RBP)),
+ NS_MEM_FROM("syscall_exit_work", "system_call",
+ full_pt_regs, BB_SKIP(RAX)),
+ NS_MEM("syscall_exit_work", full_pt_regs, 0),
+
+ NS_MEM_FROM("syscall_trace_entry", "ia32_sysenter_target",
+ full_pt_regs, BB_SKIP(RBP)),
+ NS_MEM_FROM("syscall_trace_entry", "system_call",
+ full_pt_regs, BB_SKIP(RAX)),
+ NS_MEM("syscall_trace_entry", full_pt_regs, 0),
+
+ /* Now the cases that pass data in registers. We do not check any
+ * memory state for these cases.
+ */
+
+ NS_REG("syscall_fault", all_regs, 0),
+
+ NS_REG("bad_get_user", all_regs,
+ BB_SKIP(RAX) | BB_SKIP(RDX)),
+
+ /* Finally the cases that pass data in both registers and memory.
+ */
+
+ /* This entry is redundant now because bb_fixup_switch_to() hides the
+ * jmp __switch_to case, however the entry is left here as
+ * documentation.
+ *
+ * NS("__switch_to", no_memory, no_regs, 0, 0, 0),
+ */
+
+ NS("iret_exc", no_memory, all_regs, 0, 0, 0x20),
+
+#ifdef CONFIG_MATH_EMULATION
+ NS("fpu_reg_round", mem_fpu_reg_round, reg_fpu_reg_round, 0, 0, 0),
+#endif /* CONFIG_MATH_EMULATION */
+};
+
+static const char *bb_spurious[] = {
+ /* ret_from_exception */
+ "ret_from_intr",
+ "check_userspace",
+ "resume_userspace",
+ /* resume_kernel */
+#ifdef CONFIG_PREEMPT
+ "need_resched",
+#endif /* CONFIG_PREEMPT */
+ /* ia32_sysenter_target */
+ "sysenter_past_esp",
+ /* system_call */
+ "no_singlestep",
+ "syscall_call",
+ "syscall_exit",
+ "restore_all",
+ "restore_nocheck",
+ "restore_nocheck_notrace",
+ "ldt_ss",
+ /* do not include iret_exc, it is in a .fixup section */
+ /* work_pending */
+ "work_resched",
+ "work_notifysig",
+#ifdef CONFIG_VM86
+ "work_notifysig_v86",
+#endif /* CONFIG_VM86 */
+ /* page_fault */
+ "error_code",
+ /* device_not_available */
+ "device_not_available_emulate",
+ /* debug */
+ "debug_esp_fix_insn",
+ "debug_stack_correct",
+ /* nmi */
+ "nmi_stack_correct",
+ "nmi_stack_fixup",
+ "nmi_debug_stack_check",
+ "nmi_espfix_stack",
+#ifdef CONFIG_HIBERNATION
+ /* restore_image */
+ "copy_loop",
+ "done",
+#endif /* CONFIG_HIBERNATION */
+#ifdef CONFIG_KPROBES
+ /* jprobe_return */
+ "jprobe_return_end",
+#endif /* CONFIG_KPROBES */
+#ifdef CONFIG_KEXEC
+ /* relocate_kernel */
+ "relocate_new_kernel",
+#endif /* CONFIG_KEXEC */
+#ifdef CONFIG_MATH_EMULATION
+ /* assorted *.S files in arch/i386/math_emu */
+ "Denorm_done",
+ "Denorm_shift_more_than_32",
+ "Denorm_shift_more_than_63",
+ "Denorm_shift_more_than_64",
+ "Do_unmasked_underflow",
+ "Exp_not_underflow",
+ "fpu_Arith_exit",
+ "fpu_reg_round",
+ "fpu_reg_round_signed_special_exit",
+ "fpu_reg_round_special_exit",
+ "L_accum_done",
+ "L_accum_loaded",
+ "L_accum_loop",
+ "L_arg1_larger",
+ "L_bugged",
+ "L_bugged_1",
+ "L_bugged_2",
+ "L_bugged_3",
+ "L_bugged_4",
+ "L_bugged_denorm_486",
+ "L_bugged_round24",
+ "L_bugged_round53",
+ "L_bugged_round64",
+ "LCheck_24_round_up",
+ "LCheck_53_round_up",
+ "LCheck_Round_Overflow",
+ "LCheck_truncate_24",
+ "LCheck_truncate_53",
+ "LCheck_truncate_64",
+ "LDenormal_adj_exponent",
+ "L_deNormalised",
+ "LDo_24_round_up",
+ "LDo_2nd_32_bits",
+ "LDo_2nd_div",
+ "LDo_3rd_32_bits",
+ "LDo_3rd_div",
+ "LDo_53_round_up",
+ "LDo_64_round_up",
+ "L_done",
+ "LDo_truncate_24",
+ "LDown_24",
+ "LDown_53",
+ "LDown_64",
+ "L_entry_bugged",
+ "L_error_exit",
+ "L_exactly_32",
+ "L_exception_exit",
+ "L_exit",
+ "L_exit_nuo_valid",
+ "L_exit_nuo_zero",
+ "L_exit_valid",
+ "L_extent_zero",
+ "LFirst_div_done",
+ "LFirst_div_not_1",
+ "L_Full_Division",
+ "LGreater_Half_24",
+ "LGreater_Half_53",
+ "LGreater_than_1",
+ "LLess_than_1",
+ "L_Make_denorm",
+ "L_more_31_no_low",
+ "L_more_63_no_low",
+ "L_more_than_31",
+ "L_more_than_63",
+ "L_more_than_64",
+ "L_more_than_65",
+ "L_more_than_95",
+ "L_must_be_zero",
+ "L_n_exit",
+ "L_no_adjust",
+ "L_no_bit_lost",
+ "L_no_overflow",
+ "L_no_precision_loss",
+ "L_Normalised",
+ "L_norm_bugged",
+ "L_n_shift_1",
+ "L_nuo_shift_1",
+ "L_overflow",
+ "L_precision_lost_down",
+ "L_precision_lost_up",
+ "LPrevent_2nd_overflow",
+ "LPrevent_3rd_overflow",
+ "LPseudoDenormal",
+ "L_Re_normalise",
+ "LResult_Normalised",
+ "L_round",
+ "LRound_large",
+ "LRound_nearest_24",
+ "LRound_nearest_53",
+ "LRound_nearest_64",
+ "LRound_not_small",
+ "LRound_ovfl",
+ "LRound_precision",
+ "LRound_prep",
+ "L_round_the_result",
+ "LRound_To_24",
+ "LRound_To_53",
+ "LRound_To_64",
+ "LSecond_div_done",
+ "LSecond_div_not_1",
+ "L_shift_1",
+ "L_shift_32",
+ "L_shift_65_nc",
+ "L_shift_done",
+ "Ls_less_than_32",
+ "Ls_more_than_63",
+ "Ls_more_than_95",
+ "L_Store_significand",
+ "L_subtr",
+ "LTest_over",
+ "LTruncate_53",
+ "LTruncate_64",
+ "L_underflow",
+ "L_underflow_to_zero",
+ "LUp_24",
+ "LUp_53",
+ "LUp_64",
+ "L_zero",
+ "Normalise_result",
+ "Signal_underflow",
+ "sqrt_arg_ge_2",
+ "sqrt_get_more_precision",
+ "sqrt_more_prec_large",
+ "sqrt_more_prec_ok",
+ "sqrt_more_prec_small",
+ "sqrt_near_exact",
+ "sqrt_near_exact_large",
+ "sqrt_near_exact_ok",
+ "sqrt_near_exact_small",
+ "sqrt_near_exact_x",
+ "sqrt_prelim_no_adjust",
+ "sqrt_round_result",
+ "sqrt_stage_2_done",
+ "sqrt_stage_2_error",
+ "sqrt_stage_2_finish",
+ "sqrt_stage_2_positive",
+ "sqrt_stage_3_error",
+ "sqrt_stage_3_finished",
+ "sqrt_stage_3_no_error",
+ "sqrt_stage_3_positive",
+ "Unmasked_underflow",
+ "xExp_not_underflow",
+#endif /* CONFIG_MATH_EMULATION */
+};
+
+static const char *bb_hardware_handlers[] = {
+ "ret_from_exception",
+ "system_call",
+ "work_pending",
+ "syscall_fault",
+ "page_fault",
+ "coprocessor_error",
+ "simd_coprocessor_error",
+ "device_not_available",
+ "debug",
+ "nmi",
+ "int3",
+ "overflow",
+ "bounds",
+ "invalid_op",
+ "coprocessor_segment_overrun",
+ "invalid_TSS",
+ "segment_not_present",
+ "stack_segment",
+ "general_protection",
+ "alignment_check",
+ "kdb_call",
+ "divide_error",
+ "machine_check",
+ "spurious_interrupt_bug",
+};
+
+static int
+bb_hardware_pushed_arch(kdb_machreg_t rsp,
+ const struct kdb_activation_record *ar)
+{
+ return (2 * KDB_WORD_SIZE);
+}
+
+static void
+bb_start_block0(void)
+{
+ bb_reg_code_set_value(BBRG_RAX, BBRG_RAX);
+ bb_reg_code_set_value(BBRG_RBX, BBRG_RBX);
+ bb_reg_code_set_value(BBRG_RCX, BBRG_RCX);
+ bb_reg_code_set_value(BBRG_RDX, BBRG_RDX);
+ bb_reg_code_set_value(BBRG_RDI, BBRG_RDI);
+ bb_reg_code_set_value(BBRG_RSI, BBRG_RSI);
+ bb_reg_code_set_value(BBRG_RBP, BBRG_RBP);
+ bb_reg_code_set_value(BBRG_RSP, BBRG_OSP);
+}
+
+/* The i386 code that switches stack in a context switch is an extremely
+ * special case. It saves the rip pointing to a label that is not otherwise
+ * referenced, saves the current rsp then pushes a word. The magic code that
+ * resumes the new task picks up the saved rip and rsp, effectively referencing
+ * a label that otherwise is not used and ignoring the pushed word.
+ *
+ * The simplest way to handle this very strange case is to recognise jmp
+ * address <__switch_to> and treat it as a popfl instruction. This avoids
+ * terminating the block on this jmp and removes one word from the stack state,
+ * which is the end effect of all the magic code.
+ *
+ * Called with the instruction line, starting after the first ':'.
+ */
+
+static void
+bb_fixup_switch_to(char *p)
+{
+ char *p1 = p;
+ p += strspn(p, " \t"); /* start of instruction */
+ if (strncmp(p, "jmp", 3))
+ return;
+ p += strcspn(p, " \t"); /* end of instruction */
+ p += strspn(p, " \t"); /* start of address */
+ p += strcspn(p, " \t"); /* end of address */
+ p += strspn(p, " \t"); /* start of comment */
+ if (strcmp(p, "<__switch_to>") == 0)
+ strcpy(p1, "popfl");
+}
+
+static int
+bb_asmlinkage_arch(void)
+{
+ return strcmp(bb_func_name, "ret_from_exception") == 0 ||
+ strcmp(bb_func_name, "syscall_trace_entry") == 0;
+}
+
+#endif /* CONFIG_X86_64 */
+
+
+/*============================================================================*/
+/* */
+/* Common code and data. */
+/* */
+/*============================================================================*/
+
+
+/* Tracking registers by decoding the instructions is quite a bit harder than
+ * doing the same tracking using compiler generated information. Register
+ * contents can remain in the same register, they can be copied to other
+ * registers, they can be stored on stack or they can be modified/overwritten.
+ * At any one time, there are 0 or more copies of the original value that was
+ * supplied in each register on input to the current function. If a register
+ * exists in multiple places, one copy of that register is the master version,
+ * the others are temporary copies which may or may not be destroyed before the
+ * end of the function.
+ *
+ * The compiler knows which copy of a register is the master and which are
+ * temporary copies, which makes it relatively easy to track register contents
+ * as they are saved and restored. Without that compiler based knowledge, this
+ * code has to track _every_ possible copy of each register, simply because we
+ * do not know which is the master copy and which are temporary copies which
+ * may be destroyed later.
+ *
+ * It gets worse: registers that contain parameters can be copied to other
+ * registers which are then saved on stack in a lower level function. Also the
+ * stack pointer may be held in multiple registers (typically RSP and RBP)
+ * which contain different offsets from the base of the stack on entry to this
+ * function. All of which means that we have to track _all_ register
+ * movements, or at least as much as possible.
+ *
+ * Start with the basic block that contains the start of the function, by
+ * definition all registers contain their initial value. Track each
+ * instruction's effect on register contents, this includes reading from a
+ * parameter register before any write to that register, IOW the register
+ * really does contain a parameter. The register state is represented by a
+ * dynamically sized array with each entry containing :-
+ *
+ * Register name
+ * Location it is copied to (another register or stack + offset)
+ *
+ * Besides the register tracking array, we track which parameter registers are
+ * read before being written, to determine how many parameters are passed in
+ * registers. We also track which registers contain stack pointers, including
+ * their offset from the original stack pointer on entry to the function.
+ *
+ * At each exit from the current basic block (via JMP instruction or drop
+ * through), the register state is cloned to form the state on input to the
+ * target basic block and the target is marked for processing using this state.
+ * When there are multiple ways to enter a basic block (e.g. several JMP
+ * instructions referencing the same target) then there will be multiple sets
+ * of register state to form the "input" for that basic block, there is no
+ * guarantee that all paths to that block will have the same register state.
+ *
+ * As each target block is processed, all the known sets of register state are
+ * merged to form a suitable subset of the state which agrees with all the
+ * inputs. The most common case is where one path to this block copies a
+ * register to another register but another path does not, therefore the copy
+ * is only a temporary and should not be propagated into this block.
+ *
+ * If the target block already has an input state from the current transfer
+ * point and the new input state is identical to the previous input state then
+ * we have reached a steady state for the arc from the current location to the
+ * target block. Therefore there is no need to process the target block again.
+ *
+ * The steps of "process a block, create state for target block(s), pick a new
+ * target block, merge state for target block, process target block" will
+ * continue until all the state changes have propagated all the way down the
+ * basic block tree, including round any cycles in the tree. The merge step
+ * only deletes tracking entries from the input state(s), it never adds a
+ * tracking entry. Therefore the overall algorithm is guaranteed to converge
+ * to a steady state, the worst possible case is that every tracking entry into
+ * a block is deleted, which will result in an empty output state.
+ *
+ * As each instruction is decoded, it is checked to see if this is the point at
+ * which execution left this function. This can be a call to another function
+ * (actually the return address to this function) or is the instruction which
+ * was about to be executed when an interrupt occurred (including an oops).
+ * Save the register state at this point.
+ *
+ * We always know what the registers contain when execution left this function.
+ * For an interrupt, the registers are in struct pt_regs. For a call to
+ * another function, we have already deduced the register state on entry to the
+ * other function by unwinding to the start of that function. Given the
+ * register state on exit from this function plus the known register contents
+ * on entry to the next function, we can determine the stack pointer value on
+ * input to this function. That in turn lets us calculate the address of input
+ * registers that have been stored on stack, giving us the input parameters.
+ * Finally the stack pointer gives us the return address which is the exit
+ * point from the calling function, repeat the unwind process on that function.
+ *
+ * The data that tracks which registers contain input parameters is function
+ * global, not local to any basic block. To determine which input registers
+ * contain parameters, we have to decode the entire function. Otherwise an
+ * exit early in the function might not have read any parameters yet.
+ */
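+
+/* Worked sketch (illustrative, not from the original code): for the prologue
+ *
+ *	push   %rbp		# memory at OSP-0x8 now holds the input RBP
+ *	mov    %rsp,%rbp	# RBP now holds a stack pointer, OSP-0x8
+ *	mov    %rdi,%rbx	# RBX now holds the input value of RDI
+ *
+ * the tracker records that RDI was read before being written (so it carries a
+ * parameter) and that both RSP and RBP hold stack pointers at known offsets
+ * from OSP, which is exactly the state needed to unwind this frame later.
+ */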
+
+/* Record memory contents in terms of the values that were passed to this
+ * function, IOW track which memory locations contain an input value. A memory
+ * location's contents can be undefined, it can contain an input register value
+ * or it can contain an offset from the original stack pointer.
+ *
+ * This structure is used to record register contents that have been stored in
+ * memory. Location (BBRG_OSP + 'offset_address') contains the input value
+ * from register 'value'. When 'value' is BBRG_OSP then offset_value contains
+ * the offset from the original stack pointer that was stored in this memory
+ * location. When 'value' is not BBRG_OSP then the memory location contains
+ * the original contents of an input register and offset_value is ignored.
+ *
+ * An input register 'value' can be stored in more than one register and/or in
+ * more than one memory location.
+ */
+
+struct bb_memory_contains
+{
+ short offset_address;
+ enum bb_reg_code value: 8;
+ short offset_value;
+};
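+
+/* Illustrative example (not from the original code): a "push %rbp" at function
+ * entry would be recorded as { .offset_address = -0x8, .value = BBRG_RBP },
+ * while a slot holding a saved stack pointer would instead use
+ * .value = BBRG_OSP with .offset_value giving that pointer's offset from OSP.
+ */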
+
+/* Track the register state in each basic block. */
+
+struct bb_reg_state
+{
+ /* Indexed by register value 'reg - BBRG_RAX' */
+ struct bb_reg_contains contains[KDB_INT_REGISTERS];
+ int ref_count;
+ int mem_count;
+ /* dynamic size for memory locations, see mem_count */
+ struct bb_memory_contains memory[0];
+};
+
+static struct bb_reg_state *bb_reg_state, *bb_exit_state;
+static int bb_reg_state_max, bb_reg_params, bb_memory_params;
+
+struct bb_actual
+{
+ bfd_vma value;
+ int valid;
+};
+
+/* Contains the actual hex value of a register, plus a valid bit. Indexed by
+ * register value 'reg - BBRG_RAX'
+ */
+static struct bb_actual bb_actual[KDB_INT_REGISTERS];
+
+static bfd_vma bb_func_start, bb_func_end;
+static bfd_vma bb_common_interrupt, bb_error_entry, bb_ret_from_intr,
+ bb_thread_return, bb_sync_regs, bb_save_v86_state,
+ bb__sched_text_start, bb__sched_text_end,
+ bb_save_args, bb_save_rest, bb_save_paranoid;
+
+/* Record jmp instructions, both conditional and unconditional. These form the
+ * arcs between the basic blocks. This is also used to record the state when
+ * one block drops through into the next.
+ *
+ * A bb can have multiple associated bb_jmp entries, one for each jcc
+ * instruction plus at most one bb_jmp for the drop through case. If a bb
+ * drops through to the next bb then the drop through bb_jmp entry will be the
+ * last entry in the set of bb_jmp's that are associated with the bb. This is
+ * enforced by the fact that jcc entries are added during the disassembly phase
+ * of pass 1, the drop through entries are added near the end of pass 1.
+ *
+ * At address 'from' in this block, we have a jump to address 'to'. The
+ * register state at 'from' is copied to the target block.
+ */
+
+struct bb_jmp
+{
+ bfd_vma from;
+ bfd_vma to;
+ struct bb_reg_state *state;
+ unsigned int drop_through: 1;
+};
+
+struct bb
+{
+ bfd_vma start;
+ /* The end address of a basic block is sloppy. It can be the first
+ * byte of the last instruction in the block or it can be the last byte
+ * of the block.
+ */
+ bfd_vma end;
+ unsigned int changed: 1;
+ unsigned int drop_through: 1;
+};
+
+static struct bb **bb_list, *bb_curr;
+static int bb_max, bb_count;
+
+static struct bb_jmp *bb_jmp_list;
+static int bb_jmp_max, bb_jmp_count;
+
+/* Add a new bb entry to the list. This does an insert sort. */
+
+static struct bb *
+bb_new(bfd_vma order)
+{
+ int i, j;
+ struct bb *bb, *p;
+ if (bb_giveup)
+ return NULL;
+ if (bb_count == bb_max) {
+ struct bb **bb_list_new;
+ bb_max += 10;
+ bb_list_new = debug_kmalloc(bb_max*sizeof(*bb_list_new),
+ GFP_ATOMIC);
+ if (!bb_list_new) {
+ kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
+ bb_giveup = 1;
+ return NULL;
+ }
+ memcpy(bb_list_new, bb_list, bb_count*sizeof(*bb_list));
+ debug_kfree(bb_list);
+ bb_list = bb_list_new;
+ }
+ bb = debug_kmalloc(sizeof(*bb), GFP_ATOMIC);
+ if (!bb) {
+ kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
+ bb_giveup = 1;
+ return NULL;
+ }
+ memset(bb, 0, sizeof(*bb));
+ for (i = 0; i < bb_count; ++i) {
+ p = bb_list[i];
+ if ((p->start && p->start > order) ||
+ (p->end && p->end > order))
+ break;
+ }
+ for (j = bb_count-1; j >= i; --j)
+ bb_list[j+1] = bb_list[j];
+ bb_list[i] = bb;
+ ++bb_count;
+ return bb;
+}
+
+/* Add a new bb_jmp entry to the list. This list is not sorted. */
+
+static struct bb_jmp *
+bb_jmp_new(bfd_vma from, bfd_vma to, unsigned int drop_through)
+{
+ struct bb_jmp *bb_jmp;
+ if (bb_giveup)
+ return NULL;
+ if (bb_jmp_count == bb_jmp_max) {
+ struct bb_jmp *bb_jmp_list_new;
+ bb_jmp_max += 10;
+ bb_jmp_list_new =
+ debug_kmalloc(bb_jmp_max*sizeof(*bb_jmp_list_new),
+ GFP_ATOMIC);
+ if (!bb_jmp_list_new) {
+ kdb_printf("\n\n%s: out of debug_kmalloc\n",
+ __FUNCTION__);
+ bb_giveup = 1;
+ return NULL;
+ }
+ memcpy(bb_jmp_list_new, bb_jmp_list,
+ bb_jmp_count*sizeof(*bb_jmp_list));
+ debug_kfree(bb_jmp_list);
+ bb_jmp_list = bb_jmp_list_new;
+ }
+ bb_jmp = bb_jmp_list + bb_jmp_count++;
+ bb_jmp->from = from;
+ bb_jmp->to = to;
+ bb_jmp->drop_through = drop_through;
+ bb_jmp->state = NULL;
+ return bb_jmp;
+}
+
+static void
+bb_delete(int i)
+{
+ struct bb *bb = bb_list[i];
+ memcpy(bb_list+i, bb_list+i+1, (bb_count-i-1)*sizeof(*bb_list));
+ bb_list[--bb_count] = NULL;
+ debug_kfree(bb);
+}
+
+static struct bb *
+bb_add(bfd_vma start, bfd_vma end)
+{
+ int i;
+ struct bb *bb;
+ /* Ignore basic blocks whose start address is outside the current
+ * function. These occur for call instructions and for tail recursion.
+ */
+ if (start &&
+ (start < bb_func_start || start >= bb_func_end))
+ return NULL;
+ for (i = 0; i < bb_count; ++i) {
+ bb = bb_list[i];
+ if ((start && bb->start == start) ||
+ (end && bb->end == end))
+ return bb;
+ }
+ bb = bb_new(start ? start : end);
+ if (bb) {
+ bb->start = start;
+ bb->end = end;
+ }
+ return bb;
+}
+
+static struct bb_jmp *
+bb_jmp_add(bfd_vma from, bfd_vma to, unsigned int drop_through)
+{
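+	/* Reuse an existing arc with the same from/to/drop_through values,
+	 * otherwise create a new one.
+	 */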
+ int i;
+ struct bb_jmp *bb_jmp;
+ for (i = 0, bb_jmp = bb_jmp_list; i < bb_jmp_count; ++i, ++bb_jmp) {
+ if (bb_jmp->from == from &&
+ bb_jmp->to == to &&
+ bb_jmp->drop_through == drop_through)
+ return bb_jmp;
+ }
+ bb_jmp = bb_jmp_new(from, to, drop_through);
+ return bb_jmp;
+}
+
+static unsigned long bb_curr_addr, bb_exit_addr;
+static char bb_buffer[256]; /* A bit too big to go on stack */
+
+/* Computed jmp uses 'jmp *addr(,%reg,[48])' where 'addr' is the start of a
+ * table of addresses that point into the current function. Run the table and
+ * generate bb starts for each target address plus a bb_jmp from this address
+ * to the target address.
+ *
+ * Only called for 'jmp' instructions, with the pointer starting at 'jmp'.
+ */
+
+static void
+bb_pass1_computed_jmp(char *p)
+{
+ unsigned long table, scale;
+ kdb_machreg_t addr;
+	struct bb *bb;
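+	/* At this point p points at text of the form "jmp *0x<table>(,%reg,8)"
+	 * (scale 8 on x86_64, 4 on i386).  Skip the mnemonic, then insist on
+	 * the leading '*' before reading the table address.
+	 */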
+ p += strcspn(p, " \t"); /* end of instruction */
+ p += strspn(p, " \t"); /* start of address */
+ if (*p++ != '*')
+ return;
+ table = simple_strtoul(p, &p, 0);
+ if (strncmp(p, "(,%", 3) != 0)
+ return;
+ p += 3;
+ p += strcspn(p, ","); /* end of reg */
+ if (*p++ != ',')
+ return;
+ scale = simple_strtoul(p, &p, 0);
+ if (scale != KDB_WORD_SIZE || strcmp(p, ")"))
+ return;
+ while (!bb_giveup) {
+ if (kdb_getword(&addr, table, sizeof(addr)))
+ return;
+ if (addr < bb_func_start || addr >= bb_func_end)
+ return;
+ bb = bb_add(addr, 0);
+ if (bb)
+ bb_jmp_add(bb_curr_addr, addr, 0);
+ table += KDB_WORD_SIZE;
+ }
+}
+
+/* Pass 1, identify the start and end of each basic block */
+
+static int
+bb_dis_pass1(PTR file, const char *fmt, ...)
+{
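+	/* The disassembler emits each line of output as a series of fprintf
+	 * calls; accumulate the fragments in bb_buffer and only parse the
+	 * line once a '\n' has been seen.
+	 */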
+ int l = strlen(bb_buffer);
+ char *p;
+ va_list ap;
+ va_start(ap, fmt);
+ vsnprintf(bb_buffer + l, sizeof(bb_buffer) - l, fmt, ap);
+ va_end(ap);
+ if ((p = strchr(bb_buffer, '\n'))) {
+ *p = '\0';
+ /* ret[q], iret[q], sysexit, sysret, ud2a or jmp[q] end a
+ * block. As does a call to a function marked noret.
+ */
+ p = bb_buffer;
+ p += strcspn(p, ":");
+ if (*p++ == ':') {
+ bb_fixup_switch_to(p);
+ p += strspn(p, " \t"); /* start of instruction */
+ if (strncmp(p, "ret", 3) == 0 ||
+ strncmp(p, "iret", 4) == 0 ||
+ strncmp(p, "sysexit", 7) == 0 ||
+ strncmp(p, "sysret", 6) == 0 ||
+ strncmp(p, "ud2a", 4) == 0 ||
+ strncmp(p, "jmp", 3) == 0) {
+ if (strncmp(p, "jmp", 3) == 0)
+ bb_pass1_computed_jmp(p);
+ bb_add(0, bb_curr_addr);
+			}
+ if (strncmp(p, "call", 4) == 0) {
+ strsep(&p, " \t"); /* end of opcode */
+ if (p)
+ p += strspn(p, " \t"); /* operand(s) */
+ if (p && strchr(p, '<')) {
+ p = strchr(p, '<') + 1;
+ *strchr(p, '>') = '\0';
+ if (bb_noret(p))
+ bb_add(0, bb_curr_addr);
+ }
+			}
+ }
+ bb_buffer[0] = '\0';
+ }
+ return 0;
+}
+
+static void
+bb_printaddr_pass1(bfd_vma addr, disassemble_info *dip)
+{
+ kdb_symtab_t symtab;
+ unsigned int offset;
+	struct bb *bb;
+ /* disasm only calls the printaddr routine for the target of jmp, loop
+ * or call instructions, i.e. the start of a basic block. call is
+ * ignored by bb_add because the target address is outside the current
+ * function.
+ */
+ dip->fprintf_func(dip->stream, "0x%lx", addr);
+ kdbnearsym(addr, &symtab);
+ if (symtab.sym_name) {
+ dip->fprintf_func(dip->stream, " <%s", symtab.sym_name);
+ if ((offset = addr - symtab.sym_start))
+ dip->fprintf_func(dip->stream, "+0x%x", offset);
+ dip->fprintf_func(dip->stream, ">");
+ }
+ bb = bb_add(addr, 0);
+ if (bb)
+ bb_jmp_add(bb_curr_addr, addr, 0);
+}
+
+static void
+bb_pass1(void)
+{
+ int i;
+ unsigned long addr;
+ struct bb *bb;
+ struct bb_jmp *bb_jmp;
+
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
+ kdb_printf("%s: func_name %s func_start " kdb_bfd_vma_fmt0
+ " func_end " kdb_bfd_vma_fmt0 "\n",
+ __FUNCTION__,
+ bb_func_name,
+ bb_func_start,
+ bb_func_end);
+ kdb_di.fprintf_func = bb_dis_pass1;
+ kdb_di.print_address_func = bb_printaddr_pass1;
+
+ bb_add(bb_func_start, 0);
+ for (bb_curr_addr = bb_func_start;
+ bb_curr_addr < bb_func_end;
+ ++bb_curr_addr) {
+ unsigned char c;
+ if (kdb_getarea(c, bb_curr_addr)) {
+ kdb_printf("%s: unreadable function code at ",
+ __FUNCTION__);
+ kdb_symbol_print(bb_curr_addr, NULL, KDB_SP_DEFAULT);
+ kdb_printf(", giving up\n");
+ bb_giveup = 1;
+ return;
+ }
+ }
+ for (addr = bb_func_start; addr < bb_func_end; ) {
+ bb_curr_addr = addr;
+ addr += kdba_id_printinsn(addr, &kdb_di);
+ kdb_di.fprintf_func(NULL, "\n");
+ }
+ if (bb_giveup)
+ goto out;
+
+ /* Special case: a block consisting of a single instruction which is
+ * both the target of a jmp and is also an ending instruction, so we
+ * add two blocks using the same address, one as a start and one as an
+ * end, in no guaranteed order. The end must be ordered after the
+ * start.
+ */
+ for (i = 0; i < bb_count-1; ++i) {
+ struct bb *bb1 = bb_list[i], *bb2 = bb_list[i+1];
+ if (bb1->end && bb1->end == bb2->start) {
+ bb = bb_list[i+1];
+ bb_list[i+1] = bb_list[i];
+ bb_list[i] = bb;
+ }
+ }
+
+ /* Some bb have a start address, some have an end address. Collapse
+ * them into entries that have both start and end addresses. The first
+ * entry is guaranteed to have a start address.
+ */
+ for (i = 0; i < bb_count-1; ++i) {
+ struct bb *bb1 = bb_list[i], *bb2 = bb_list[i+1];
+ if (bb1->end)
+ continue;
+ if (bb2->start) {
+ bb1->end = bb2->start - 1;
+ bb1->drop_through = 1;
+ bb_jmp_add(bb1->end, bb2->start, 1);
+ } else {
+ bb1->end = bb2->end;
+ bb_delete(i+1);
+ }
+ }
+ bb = bb_list[bb_count-1];
+ if (!bb->end)
+ bb->end = bb_func_end - 1;
+
+ /* It would be nice to check that all bb have a valid start and end
+ * address but there is just too much garbage code in the kernel to do
+ * that check. Aligned functions in assembler code mean that there is
+ * space between the end of one function and the start of the next and
+ * that space contains previous code from the assembler's buffers. It
+ * looks like dead code with nothing that branches to it, so no start
+ * address. do_sys_vm86() ends with 'jmp resume_userspace' which the C
+ * compiler does not know about so gcc appends the normal exit code,
+ * again nothing branches to this dangling code.
+ *
+ * The best we can do is delete bb entries with no start address.
+ */
+ for (i = 0; i < bb_count; ++i) {
+ struct bb *bb = bb_list[i];
+ if (!bb->start)
+ bb_delete(i--);
+ }
+ for (i = 0; i < bb_count; ++i) {
+ struct bb *bb = bb_list[i];
+ if (!bb->end) {
+ kdb_printf("%s: incomplete bb state\n", __FUNCTION__);
+ bb_giveup = 1;
+ goto debug;
+ }
+ }
+
+out:
+ if (!KDB_DEBUG(BB))
+ return;
+debug:
+ kdb_printf("%s: end\n", __FUNCTION__);
+ for (i = 0; i < bb_count; ++i) {
+ bb = bb_list[i];
+ kdb_printf(" bb[%d] start "
+ kdb_bfd_vma_fmt0
+ " end " kdb_bfd_vma_fmt0
+ " drop_through %d",
+ i, bb->start, bb->end, bb->drop_through);
+ kdb_printf("\n");
+ }
+ for (i = 0; i < bb_jmp_count; ++i) {
+ bb_jmp = bb_jmp_list + i;
+ kdb_printf(" bb_jmp[%d] from "
+ kdb_bfd_vma_fmt0
+ " to " kdb_bfd_vma_fmt0
+ " drop_through %d\n",
+ i, bb_jmp->from, bb_jmp->to, bb_jmp->drop_through);
+ }
+}
+
+/* Pass 2, record register changes in each basic block */
+
+/* For each opcode that we care about, indicate how it uses its operands. Most
+ * opcodes can be handled generically because they completely specify their
+ * operands in the instruction; however, many opcodes have side effects such as
+ * reading or writing rax or updating rsp. Instructions that change registers
+ * that are not listed in the operands must be handled as special cases. In
+ * addition, instructions that copy registers while preserving their contents
+ * (push, pop, mov) or change the contents in a well defined way (add with an
+ * immediate, lea) must be handled as special cases in order to track the
+ * register contents.
+ *
+ * The tables below only list opcodes that are actually used in the Linux
+ * kernel, so they omit most of the floating point and all of the SSE type
+ * instructions. The operand usage entries only cater for accesses to memory
+ * and to the integer registers, accesses to floating point registers and flags
+ * are not relevant for kernel backtraces.
+ */
+
+enum bb_operand_usage {
+ BBOU_UNKNOWN = 0,
+	/* Generic entries.  Because xchg can do any combination of
+	 * read src, write src, read dst and write dst we need to
+	 * define all 16 possibilities.  These are ordered by rs = 1,
+	 * rd = 2, ws = 4, wd = 8; the bb_usage_x*() functions rely on
+	 * this order.
+ */
+ BBOU_RS = 1, /* read src */ /* 1 */
+ BBOU_RD, /* read dst */ /* 2 */
+ BBOU_RSRD, /* 3 */
+ BBOU_WS, /* write src */ /* 4 */
+ BBOU_RSWS, /* 5 */
+ BBOU_RDWS, /* 6 */
+ BBOU_RSRDWS, /* 7 */
+ BBOU_WD, /* write dst */ /* 8 */
+ BBOU_RSWD, /* 9 */
+ BBOU_RDWD, /* 10 */
+ BBOU_RSRDWD, /* 11 */
+ BBOU_WSWD, /* 12 */
+ BBOU_RSWSWD, /* 13 */
+ BBOU_RDWSWD, /* 14 */
+ BBOU_RSRDWSWD, /* 15 */
+ /* opcode specific entries */
+ BBOU_ADD,
+ BBOU_AND,
+ BBOU_CALL,
+ BBOU_CBW,
+ BBOU_CMOV,
+ BBOU_CMPXCHG,
+ BBOU_CMPXCHGD,
+ BBOU_CPUID,
+ BBOU_CWD,
+ BBOU_DIV,
+ BBOU_IDIV,
+ BBOU_IMUL,
+ BBOU_IRET,
+ BBOU_JMP,
+ BBOU_LAHF,
+ BBOU_LEA,
+ BBOU_LEAVE,
+ BBOU_LODS,
+ BBOU_LOOP,
+ BBOU_LSS,
+ BBOU_MONITOR,
+ BBOU_MOV,
+ BBOU_MOVS,
+ BBOU_MUL,
+ BBOU_MWAIT,
+ BBOU_NOP,
+ BBOU_OUTS,
+ BBOU_POP,
+ BBOU_POPF,
+ BBOU_PUSH,
+ BBOU_PUSHF,
+ BBOU_RDMSR,
+ BBOU_RDTSC,
+ BBOU_RET,
+ BBOU_SAHF,
+ BBOU_SCAS,
+ BBOU_SUB,
+ BBOU_SYSEXIT,
+ BBOU_SYSRET,
+ BBOU_WRMSR,
+ BBOU_XADD,
+ BBOU_XCHG,
+ BBOU_XOR,
+};
+
+struct bb_opcode_usage {
+ int length;
+ enum bb_operand_usage usage;
+ const char *opcode;
+};
+
+/* This table is sorted in alphabetical order of opcode, except that the
+ * trailing '"' is treated as a high value. For example, 'in' sorts after
+ * 'inc', 'bt' after 'btc'. This modified sort order ensures that shorter
+ * opcodes come after long ones. A normal sort would put 'in' first, so 'in'
+ * would match both 'inc' and 'in'. When adding any new entries to this table,
+ * be careful to put shorter entries last in their group.
+ *
+ * To automatically sort the table (in vi)
+ * Mark the first and last opcode line with 'a and 'b
+ * 'a
+ * !'bsed -e 's/"}/}}/' | LANG=C sort -t '"' -k2 | sed -e 's/}}/"}/'
+ *
+ * If a new instruction has to be added, first consider if it affects registers
+ * other than those listed in the operands. Also consider if you want to track
+ * the results of issuing the instruction, IOW can you extract useful
+ * information by looking in detail at the modified registers or memory. If
+ * either test is true then you need a special case to handle the instruction.
+ *
+ * The generic entries at the start of enum bb_operand_usage all have one thing
+ * in common, if a register or memory location is updated then that location
+ * becomes undefined, i.e. we lose track of anything that was previously saved
+ * in that location. So only use a generic BBOU_* value when the result of the
+ * instruction cannot be calculated exactly _and_ when all the affected
+ * registers are listed in the operands.
+ *
+ * Examples:
+ *
+ * 'call' does not generate a known result, but as a side effect of call,
+ * several scratch registers become undefined, so it needs a special BBOU_CALL
+ * entry.
+ *
+ * 'adc' generates a variable result, it depends on the carry flag, so 'adc'
+ * gets a generic entry. 'add' can generate an exact result (add with
+ * immediate on a register that points to the stack) or it can generate an
+ * unknown result (add a variable, or add immediate to a register that does not
+ * contain a stack pointer) so 'add' has its own BBOU_ADD entry.
+ */
+
+static const struct bb_opcode_usage
+bb_opcode_usage_all[] = {
+ {3, BBOU_RSRDWD, "adc"},
+ {3, BBOU_ADD, "add"},
+ {3, BBOU_AND, "and"},
+ {3, BBOU_RSWD, "bsf"},
+ {3, BBOU_RSWD, "bsr"},
+ {5, BBOU_RSWS, "bswap"},
+ {3, BBOU_RSRDWD, "btc"},
+ {3, BBOU_RSRDWD, "btr"},
+ {3, BBOU_RSRDWD, "bts"},
+ {2, BBOU_RSRD, "bt"},
+ {4, BBOU_CALL, "call"},
+ {4, BBOU_CBW, "cbtw"}, /* Intel cbw */
+ {3, BBOU_NOP, "clc"},
+ {3, BBOU_NOP, "cld"},
+ {7, BBOU_RS, "clflush"},
+ {4, BBOU_NOP, "clgi"},
+ {3, BBOU_NOP, "cli"},
+ {4, BBOU_CWD, "cltd"}, /* Intel cdq */
+ {4, BBOU_CBW, "cltq"}, /* Intel cdqe */
+ {4, BBOU_NOP, "clts"},
+ {4, BBOU_CMOV, "cmov"},
+ {9, BBOU_CMPXCHGD,"cmpxchg16"},
+ {8, BBOU_CMPXCHGD,"cmpxchg8"},
+ {7, BBOU_CMPXCHG, "cmpxchg"},
+ {3, BBOU_RSRD, "cmp"},
+ {5, BBOU_CPUID, "cpuid"},
+	{4, BBOU_CWD, "cqto"}, /* Intel cqo */
+ {4, BBOU_CWD, "cwtd"}, /* Intel cwd */
+ {4, BBOU_CBW, "cwtl"}, /* Intel cwde */
+ {4, BBOU_NOP, "data"}, /* alternative ASM_NOP<n> generates data16 on x86_64 */
+ {3, BBOU_RSWS, "dec"},
+ {3, BBOU_DIV, "div"},
+ {5, BBOU_RS, "fdivl"},
+ {5, BBOU_NOP, "finit"},
+ {6, BBOU_RS, "fistpl"},
+ {4, BBOU_RS, "fldl"},
+ {4, BBOU_RS, "fmul"},
+ {6, BBOU_NOP, "fnclex"},
+ {6, BBOU_NOP, "fninit"},
+ {6, BBOU_RS, "fnsave"},
+ {7, BBOU_NOP, "fnsetpm"},
+ {6, BBOU_RS, "frstor"},
+ {5, BBOU_WS, "fstsw"},
+ {5, BBOU_RS, "fsubp"},
+ {5, BBOU_NOP, "fwait"},
+ {7, BBOU_RS, "fxrstor"},
+ {6, BBOU_RS, "fxsave"},
+ {3, BBOU_NOP, "hlt"},
+ {4, BBOU_IDIV, "idiv"},
+ {4, BBOU_IMUL, "imul"},
+ {3, BBOU_RSWS, "inc"},
+ {3, BBOU_NOP, "int"},
+ {7, BBOU_RSRD, "invlpga"},
+ {6, BBOU_RS, "invlpg"},
+ {2, BBOU_RSWD, "in"},
+ {4, BBOU_IRET, "iret"},
+ {1, BBOU_JMP, "j"},
+ {4, BBOU_LAHF, "lahf"},
+ {3, BBOU_RSWD, "lar"},
+ {5, BBOU_RS, "lcall"},
+ {5, BBOU_LEAVE, "leave"},
+ {3, BBOU_LEA, "lea"},
+ {6, BBOU_NOP, "lfence"},
+ {4, BBOU_RS, "lgdt"},
+ {4, BBOU_RS, "lidt"},
+ {4, BBOU_RS, "ljmp"},
+ {4, BBOU_RS, "lldt"},
+ {4, BBOU_RS, "lmsw"},
+ {4, BBOU_LODS, "lods"},
+ {4, BBOU_LOOP, "loop"},
+ {4, BBOU_NOP, "lret"},
+ {3, BBOU_RSWD, "lsl"},
+ {3, BBOU_LSS, "lss"},
+ {3, BBOU_RS, "ltr"},
+ {6, BBOU_NOP, "mfence"},
+ {7, BBOU_MONITOR, "monitor"},
+ {4, BBOU_MOVS, "movs"},
+ {3, BBOU_MOV, "mov"},
+ {3, BBOU_MUL, "mul"},
+ {5, BBOU_MWAIT, "mwait"},
+ {3, BBOU_RSWS, "neg"},
+ {3, BBOU_NOP, "nop"},
+ {3, BBOU_RSWS, "not"},
+ {2, BBOU_RSRDWD, "or"},
+ {4, BBOU_OUTS, "outs"},
+ {3, BBOU_RSRD, "out"},
+ {5, BBOU_NOP, "pause"},
+ {4, BBOU_POPF, "popf"},
+ {3, BBOU_POP, "pop"},
+ {8, BBOU_RS, "prefetch"},
+ {5, BBOU_PUSHF, "pushf"},
+ {4, BBOU_PUSH, "push"},
+ {3, BBOU_RSRDWD, "rcl"},
+ {3, BBOU_RSRDWD, "rcr"},
+ {5, BBOU_RDMSR, "rdmsr"},
+ {5, BBOU_RDMSR, "rdpmc"}, /* same side effects as rdmsr */
+ {5, BBOU_RDTSC, "rdtsc"},
+ {3, BBOU_RET, "ret"},
+ {3, BBOU_RSRDWD, "rol"},
+ {3, BBOU_RSRDWD, "ror"},
+ {4, BBOU_SAHF, "sahf"},
+ {3, BBOU_RSRDWD, "sar"},
+ {3, BBOU_RSRDWD, "sbb"},
+ {4, BBOU_SCAS, "scas"},
+ {3, BBOU_WS, "set"},
+ {6, BBOU_NOP, "sfence"},
+ {4, BBOU_WS, "sgdt"},
+ {3, BBOU_RSRDWD, "shl"},
+ {3, BBOU_RSRDWD, "shr"},
+ {4, BBOU_WS, "sidt"},
+ {4, BBOU_WS, "sldt"},
+ {3, BBOU_NOP, "stc"},
+ {3, BBOU_NOP, "std"},
+ {4, BBOU_NOP, "stgi"},
+ {3, BBOU_NOP, "sti"},
+ {4, BBOU_SCAS, "stos"},
+ {4, BBOU_WS, "strl"},
+ {3, BBOU_WS, "str"},
+ {3, BBOU_SUB, "sub"},
+ {6, BBOU_NOP, "swapgs"},
+ {7, BBOU_SYSEXIT, "sysexit"},
+ {6, BBOU_SYSRET, "sysret"},
+ {4, BBOU_NOP, "test"},
+ {4, BBOU_NOP, "ud2a"},
+ {7, BBOU_RS, "vmclear"},
+ {8, BBOU_NOP, "vmlaunch"},
+ {6, BBOU_RS, "vmload"},
+ {7, BBOU_RS, "vmptrld"},
+ {6, BBOU_WD, "vmread"}, /* vmread src is an encoding, not a register */
+ {8, BBOU_NOP, "vmresume"},
+ {5, BBOU_RS, "vmrun"},
+ {6, BBOU_RS, "vmsave"},
+ {7, BBOU_WD, "vmwrite"}, /* vmwrite src is an encoding, not a register */
+ {3, BBOU_NOP, "vmxoff"},
+ {6, BBOU_NOP, "wbinvd"},
+ {5, BBOU_WRMSR, "wrmsr"},
+ {4, BBOU_XADD, "xadd"},
+ {4, BBOU_XCHG, "xchg"},
+ {3, BBOU_XOR, "xor"},
+ {4, BBOU_NOP, "xrstor"},
+ {4, BBOU_NOP, "xsave"},
+ {10, BBOU_WS, "xstore-rng"},
+};
+
+/* To speed up searching, index bb_opcode_usage_all by the first letter of each
+ * opcode.
+ */
+static struct {
+ const struct bb_opcode_usage *opcode;
+ int size;
+} bb_opcode_usage[26];
+
+struct bb_operand {
+ char *base;
+ char *index;
+ char *segment;
+ long disp;
+ unsigned int scale;
+ enum bb_reg_code base_rc; /* UNDEFINED or RAX through R15 */
+ enum bb_reg_code index_rc; /* UNDEFINED or RAX through R15 */
+ unsigned int present :1;
+ unsigned int disp_present :1;
+ unsigned int indirect :1; /* must be combined with reg or memory */
+ unsigned int immediate :1; /* exactly one of these 3 must be set */
+ unsigned int reg :1;
+ unsigned int memory :1;
+};
+
+struct bb_decode {
+ char *prefix;
+ char *opcode;
+ const struct bb_opcode_usage *match;
+ struct bb_operand src;
+ struct bb_operand dst;
+ struct bb_operand dst2;
+};
+
+static struct bb_decode bb_decode;
+
+static enum bb_reg_code
+bb_reg_map(const char *reg)
+{
+ int lo, hi, c;
+ const struct bb_reg_code_map *p;
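+	/* Binary search of bb_reg_code_map by name.  reg+1 skips the leading
+	 * '%' that the disassembler prints before register names.
+	 */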
+ lo = 0;
+ hi = ARRAY_SIZE(bb_reg_code_map) - 1;
+ while (lo <= hi) {
+ int mid = (hi + lo) / 2;
+ p = bb_reg_code_map + mid;
+ c = strcmp(p->name, reg+1);
+ if (c == 0)
+ return p->reg;
+ else if (c > 0)
+ hi = mid - 1;
+ else
+ lo = mid + 1;
+ }
+ return BBRG_UNDEFINED;
+}
+
+static void
+bb_parse_operand(char *str, struct bb_operand *operand)
+{
+ char *p = str;
+ int sign = 1;
+ operand->present = 1;
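+	/* Operands arrive in AT&T syntax, e.g. "$0x10" (immediate), "%rax"
+	 * (register), "-0x8(%rbp)" or "0x4(,%rax,8)" (memory), "*%rax"
+	 * (indirect), optionally preceded by a "%gs:" style segment prefix.
+	 * Classify the operand and extract any displacement, base, index and
+	 * scale.
+	 */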
+ /* extract any segment prefix */
+ if (p[0] == '%' && p[1] && p[2] == 's' && p[3] == ':') {
+ operand->memory = 1;
+ operand->segment = p;
+ p[3] = '\0';
+ p += 4;
+ }
+ /* extract displacement, base, index, scale */
+ if (*p == '*') {
+ /* jmp/call *disp(%reg), *%reg or *0xnnn */
+ operand->indirect = 1;
+ ++p;
+ }
+ if (*p == '-') {
+ sign = -1;
+ ++p;
+ }
+ if (*p == '$') {
+ operand->immediate = 1;
+ operand->disp_present = 1;
+ operand->disp = simple_strtoul(p+1, &p, 0);
+ } else if (isdigit(*p)) {
+ operand->memory = 1;
+ operand->disp_present = 1;
+ operand->disp = simple_strtoul(p, &p, 0) * sign;
+ }
+ if (*p == '%') {
+ operand->reg = 1;
+ operand->base = p;
+ } else if (*p == '(') {
+ operand->memory = 1;
+ operand->base = ++p;
+ p += strcspn(p, ",)");
+ if (p == operand->base)
+ operand->base = NULL;
+ if (*p == ',') {
+ *p = '\0';
+ operand->index = ++p;
+ p += strcspn(p, ",)");
+ if (p == operand->index)
+ operand->index = NULL;
+ }
+ if (*p == ',') {
+ *p = '\0';
+ operand->scale = simple_strtoul(p+1, &p, 0);
+ }
+ *p = '\0';
+ } else if (*p) {
+ kdb_printf("%s: unexpected token '%c' after disp '%s'\n",
+ __FUNCTION__, *p, str);
+ bb_giveup = 1;
+ }
+ if ((operand->immediate + operand->reg + operand->memory != 1) ||
+ (operand->indirect && operand->immediate)) {
+ kdb_printf("%s: incorrect decode '%s' N %d I %d R %d M %d\n",
+ __FUNCTION__, str,
+ operand->indirect, operand->immediate, operand->reg,
+ operand->memory);
+ bb_giveup = 1;
+ }
+ if (operand->base)
+ operand->base_rc = bb_reg_map(operand->base);
+ if (operand->index)
+ operand->index_rc = bb_reg_map(operand->index);
+}
+
+static void
+bb_print_operand(const char *type, const struct bb_operand *operand)
+{
+ if (!operand->present)
+ return;
+ kdb_printf(" %s %c%c: ",
+ type,
+ operand->indirect ? 'N' : ' ',
+ operand->immediate ? 'I' :
+ operand->reg ? 'R' :
+ operand->memory ? 'M' :
+ '?'
+ );
+ if (operand->segment)
+ kdb_printf("%s:", operand->segment);
+ if (operand->immediate) {
+ kdb_printf("$0x%lx", operand->disp);
+ } else if (operand->reg) {
+ if (operand->indirect)
+ kdb_printf("*");
+ kdb_printf("%s", operand->base);
+ } else if (operand->memory) {
+ if (operand->indirect && (operand->base || operand->index))
+ kdb_printf("*");
+ if (operand->disp_present) {
+ kdb_printf("0x%lx", operand->disp);
+ }
+ if (operand->base || operand->index || operand->scale) {
+ kdb_printf("(");
+ if (operand->base)
+ kdb_printf("%s", operand->base);
+ if (operand->index || operand->scale)
+ kdb_printf(",");
+ if (operand->index)
+ kdb_printf("%s", operand->index);
+ if (operand->scale)
+ kdb_printf(",%d", operand->scale);
+ kdb_printf(")");
+ }
+ }
+ if (operand->base_rc)
+ kdb_printf(" base_rc %d (%s)",
+ operand->base_rc, bbrg_name[operand->base_rc]);
+ if (operand->index_rc)
+ kdb_printf(" index_rc %d (%s)",
+ operand->index_rc,
+ bbrg_name[operand->index_rc]);
+ kdb_printf("\n");
+}
+
+static void
+bb_print_opcode(void)
+{
+ const struct bb_opcode_usage *o = bb_decode.match;
+ kdb_printf(" ");
+ if (bb_decode.prefix)
+ kdb_printf("%s ", bb_decode.prefix);
+ kdb_printf("opcode '%s' matched by '%s', usage %d\n",
+ bb_decode.opcode, o->opcode, o->usage);
+}
+
+static int
+bb_parse_opcode(void)
+{
+ int c, i;
+ const struct bb_opcode_usage *o;
+ static int bb_parse_opcode_error_limit = 5;
+ c = bb_decode.opcode[0] - 'a';
+ if (c < 0 || c >= ARRAY_SIZE(bb_opcode_usage))
+ goto nomatch;
+ o = bb_opcode_usage[c].opcode;
+ if (!o)
+ goto nomatch;
+ for (i = 0; i < bb_opcode_usage[c].size; ++i, ++o) {
+ if (strncmp(bb_decode.opcode, o->opcode, o->length) == 0) {
+ bb_decode.match = o;
+ if (KDB_DEBUG(BB))
+ bb_print_opcode();
+ return 0;
+ }
+ }
+nomatch:
+ if (!bb_parse_opcode_error_limit)
+ return 1;
+ --bb_parse_opcode_error_limit;
+ kdb_printf("%s: no match at [%s]%s " kdb_bfd_vma_fmt0 " - '%s'\n",
+ __FUNCTION__,
+ bb_mod_name, bb_func_name, bb_curr_addr,
+ bb_decode.opcode);
+ return 1;
+}
+
+static bool
+bb_is_int_reg(enum bb_reg_code reg)
+{
+ return reg >= BBRG_RAX && reg < (BBRG_RAX + KDB_INT_REGISTERS);
+}
+
+static bool
+bb_is_simple_memory(const struct bb_operand *operand)
+{
+ return operand->memory &&
+ bb_is_int_reg(operand->base_rc) &&
+ !operand->index_rc &&
+ operand->scale == 0 &&
+ !operand->segment;
+}
+
+static bool
+bb_is_static_disp(const struct bb_operand *operand)
+{
+ return operand->memory &&
+ !operand->base_rc &&
+ !operand->index_rc &&
+ operand->scale == 0 &&
+ !operand->segment &&
+ !operand->indirect;
+}
+
+static enum bb_reg_code
+bb_reg_code_value(enum bb_reg_code reg)
+{
+ BB_CHECK(!bb_is_int_reg(reg), reg, 0);
+ return bb_reg_state->contains[reg - BBRG_RAX].value;
+}
+
+static short
+bb_reg_code_offset(enum bb_reg_code reg)
+{
+ BB_CHECK(!bb_is_int_reg(reg), reg, 0);
+ return bb_reg_state->contains[reg - BBRG_RAX].offset;
+}
+
+static void
+bb_reg_code_set_value(enum bb_reg_code dst, enum bb_reg_code src)
+{
+ BB_CHECK(!bb_is_int_reg(dst), dst, );
+ bb_reg_state->contains[dst - BBRG_RAX].value = src;
+}
+
+static void
+bb_reg_code_set_offset(enum bb_reg_code dst, short offset)
+{
+ BB_CHECK(!bb_is_int_reg(dst), dst, );
+ bb_reg_state->contains[dst - BBRG_RAX].offset = offset;
+}
+
+static bool
+bb_is_osp_defined(enum bb_reg_code reg)
+{
+ if (bb_is_int_reg(reg))
+ return bb_reg_code_value(reg) == BBRG_OSP;
+ else
+ return 0;
+}
+
+static bfd_vma
+bb_actual_value(enum bb_reg_code reg)
+{
+ BB_CHECK(!bb_is_int_reg(reg), reg, 0);
+ return bb_actual[reg - BBRG_RAX].value;
+}
+
+static int
+bb_actual_valid(enum bb_reg_code reg)
+{
+ BB_CHECK(!bb_is_int_reg(reg), reg, 0);
+ return bb_actual[reg - BBRG_RAX].valid;
+}
+
+static void
+bb_actual_set_value(enum bb_reg_code reg, bfd_vma value)
+{
+ BB_CHECK(!bb_is_int_reg(reg), reg, );
+ bb_actual[reg - BBRG_RAX].value = value;
+}
+
+static void
+bb_actual_set_valid(enum bb_reg_code reg, int valid)
+{
+ BB_CHECK(!bb_is_int_reg(reg), reg, );
+ bb_actual[reg - BBRG_RAX].valid = valid;
+}
+
+/* The scheduler code switches RSP then does PUSH; it is not an error for RSP
+ * to be undefined in this area of the code.
+ */
+static bool
+bb_is_scheduler_address(void)
+{
+ return bb_curr_addr >= bb__sched_text_start &&
+ bb_curr_addr < bb__sched_text_end;
+}
+
+static void
+bb_reg_read(enum bb_reg_code reg)
+{
+ int i, r = 0;
+ if (!bb_is_int_reg(reg) ||
+ bb_reg_code_value(reg) != reg)
+ return;
+ for (i = 0;
+ i < min_t(unsigned int, REGPARM, ARRAY_SIZE(bb_param_reg));
+ ++i) {
+ if (reg == bb_param_reg[i]) {
+ r = i + 1;
+ break;
+ }
+ }
+ bb_reg_params = max(bb_reg_params, r);
+}
+
+static void
+bb_do_reg_state_print(const struct bb_reg_state *s)
+{
+ int i, offset_address, offset_value;
+ const struct bb_memory_contains *c;
+ enum bb_reg_code value;
+ kdb_printf(" bb_reg_state %p\n", s);
+ for (i = 0; i < ARRAY_SIZE(s->contains); ++i) {
+ value = s->contains[i].value;
+ offset_value = s->contains[i].offset;
+ kdb_printf(" %s = %s",
+ bbrg_name[i + BBRG_RAX], bbrg_name[value]);
+ if (value == BBRG_OSP)
+ KDB_DEBUG_BB_OFFSET_PRINTF(offset_value, "", "");
+ kdb_printf("\n");
+ }
+ for (i = 0, c = s->memory; i < s->mem_count; ++i, ++c) {
+ offset_address = c->offset_address;
+ value = c->value;
+ offset_value = c->offset_value;
+ kdb_printf(" slot %d offset_address %c0x%x %s",
+ i,
+ offset_address >= 0 ? '+' : '-',
+ offset_address >= 0 ? offset_address : -offset_address,
+ bbrg_name[value]);
+ if (value == BBRG_OSP)
+ KDB_DEBUG_BB_OFFSET_PRINTF(offset_value, "", "");
+ kdb_printf("\n");
+ }
+}
+
+static void
+bb_reg_state_print(const struct bb_reg_state *s)
+{
+ if (KDB_DEBUG(BB))
+ bb_do_reg_state_print(s);
+}
+
+/* Set register 'dst' to contain the value from 'src'. This includes reading
+ * from 'src' and writing to 'dst'. The offset value is copied iff 'src'
+ * contains a stack pointer.
+ *
+ * Be very careful about the context here. 'dst' and 'src' reflect integer
+ * registers by name, _not_ by the value of their contents. "mov %rax,%rsi"
+ * will call this function as bb_reg_set_reg(BBRG_RSI, BBRG_RAX), which
+ * reflects what the assembler code is doing. However we need to track the
+ * _values_ in the registers, not their names. IOW, we really care about "what
+ * value does rax contain when it is copied into rsi?", so we can record the
+ * fact that we now have two copies of that value, one in rax and one in rsi.
+ */
+
+static void
+bb_reg_set_reg(enum bb_reg_code dst, enum bb_reg_code src)
+{
+ enum bb_reg_code src_value = BBRG_UNDEFINED;
+ short offset_value = 0;
+ KDB_DEBUG_BB(" %s = %s", bbrg_name[dst], bbrg_name[src]);
+ if (bb_is_int_reg(src)) {
+ bb_reg_read(src);
+ src_value = bb_reg_code_value(src);
+ KDB_DEBUG_BB(" (%s", bbrg_name[src_value]);
+ if (bb_is_osp_defined(src)) {
+ offset_value = bb_reg_code_offset(src);
+ KDB_DEBUG_BB_OFFSET(offset_value, "", "");
+ }
+ KDB_DEBUG_BB(")");
+ }
+ if (bb_is_int_reg(dst)) {
+ bb_reg_code_set_value(dst, src_value);
+ bb_reg_code_set_offset(dst, offset_value);
+ }
+ KDB_DEBUG_BB("\n");
+}
+
+static void
+bb_reg_set_undef(enum bb_reg_code dst)
+{
+ bb_reg_set_reg(dst, BBRG_UNDEFINED);
+}
+
+/* Delete any record of a stored register held in osp + 'offset' */
+
+static void
+bb_delete_memory(short offset)
+{
+ int i;
+ struct bb_memory_contains *c;
+ for (i = 0, c = bb_reg_state->memory;
+ i < bb_reg_state->mem_count;
+ ++i, ++c) {
+ if (c->offset_address == offset &&
+ c->value != BBRG_UNDEFINED) {
+ KDB_DEBUG_BB(" delete %s from ",
+ bbrg_name[c->value]);
+ KDB_DEBUG_BB_OFFSET(offset, "osp", "");
+ KDB_DEBUG_BB(" slot %d\n",
+ (int)(c - bb_reg_state->memory));
+ memset(c, BBRG_UNDEFINED, sizeof(*c));
+ if (i == bb_reg_state->mem_count - 1)
+ --bb_reg_state->mem_count;
+ }
+ }
+}
+
+/* Set memory location *('dst' + 'offset_address') to contain the supplied
+ * value and offset. 'dst' is assumed to be a register that contains a stack
+ * pointer.
+ */
+
+static void
+bb_memory_set_reg_value(enum bb_reg_code dst, short offset_address,
+ enum bb_reg_code value, short offset_value)
+{
+ int i;
+ struct bb_memory_contains *c, *free = NULL;
+ BB_CHECK(!bb_is_osp_defined(dst), dst, );
+ KDB_DEBUG_BB(" *(%s", bbrg_name[dst]);
+ KDB_DEBUG_BB_OFFSET(offset_address, "", "");
+ offset_address += bb_reg_code_offset(dst);
+ KDB_DEBUG_BB_OFFSET(offset_address, " osp", ") = ");
+ KDB_DEBUG_BB("%s", bbrg_name[value]);
+ if (value == BBRG_OSP)
+ KDB_DEBUG_BB_OFFSET(offset_value, "", "");
+ for (i = 0, c = bb_reg_state->memory;
+ i < bb_reg_state_max;
+ ++i, ++c) {
+ if (c->offset_address == offset_address)
+ free = c;
+ else if (c->value == BBRG_UNDEFINED && !free)
+ free = c;
+ }
+ if (!free) {
+ struct bb_reg_state *new, *old = bb_reg_state;
+ size_t old_size, new_size;
+ int slot;
+ old_size = sizeof(*old) + bb_reg_state_max *
+ sizeof(old->memory[0]);
+ slot = bb_reg_state_max;
+ bb_reg_state_max += 5;
+ new_size = sizeof(*new) + bb_reg_state_max *
+ sizeof(new->memory[0]);
+ new = debug_kmalloc(new_size, GFP_ATOMIC);
+ if (!new) {
+ kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
+ bb_giveup = 1;
+ } else {
+ memcpy(new, old, old_size);
+ memset((char *)new + old_size, BBRG_UNDEFINED,
+ new_size - old_size);
+ bb_reg_state = new;
+ debug_kfree(old);
+ free = bb_reg_state->memory + slot;
+ }
+ }
+ if (free) {
+ int slot = free - bb_reg_state->memory;
+ free->offset_address = offset_address;
+ free->value = value;
+ free->offset_value = offset_value;
+ KDB_DEBUG_BB(" slot %d", slot);
+ bb_reg_state->mem_count = max(bb_reg_state->mem_count, slot+1);
+ }
+ KDB_DEBUG_BB("\n");
+}
+
+/* Set memory location *('dst' + 'offset') to contain the value from register
+ * 'src'. 'dst' is assumed to be a register that contains a stack pointer.
+ * This differs from bb_memory_set_reg_value because it takes a src register
+ * which contains a value and possibly an offset; bb_memory_set_reg_value is
+ * passed the value and offset directly.
+ */
+
+static void
+bb_memory_set_reg(enum bb_reg_code dst, enum bb_reg_code src,
+ short offset_address)
+{
+ int offset_value;
+ enum bb_reg_code value;
+ BB_CHECK(!bb_is_osp_defined(dst), dst, );
+ if (!bb_is_int_reg(src))
+ return;
+ value = bb_reg_code_value(src);
+ if (value == BBRG_UNDEFINED) {
+ bb_delete_memory(offset_address + bb_reg_code_offset(dst));
+ return;
+ }
+ offset_value = bb_reg_code_offset(src);
+ bb_reg_read(src);
+ bb_memory_set_reg_value(dst, offset_address, value, offset_value);
+}
+
+/* Set register 'dst' to contain the value from memory *('src' + offset_address).
+ * 'src' is assumed to be a register that contains a stack pointer.
+ */
+
+static void
+bb_reg_set_memory(enum bb_reg_code dst, enum bb_reg_code src, short offset_address)
+{
+ int i, defined = 0;
+ struct bb_memory_contains *s;
+ BB_CHECK(!bb_is_osp_defined(src), src, );
+ KDB_DEBUG_BB(" %s = *(%s",
+ bbrg_name[dst], bbrg_name[src]);
+ KDB_DEBUG_BB_OFFSET(offset_address, "", ")");
+ offset_address += bb_reg_code_offset(src);
+ KDB_DEBUG_BB_OFFSET(offset_address, " (osp", ")");
+ for (i = 0, s = bb_reg_state->memory;
+ i < bb_reg_state->mem_count;
+ ++i, ++s) {
+ if (s->offset_address == offset_address && bb_is_int_reg(dst)) {
+ bb_reg_code_set_value(dst, s->value);
+ KDB_DEBUG_BB(" value %s", bbrg_name[s->value]);
+ if (s->value == BBRG_OSP) {
+ bb_reg_code_set_offset(dst, s->offset_value);
+ KDB_DEBUG_BB_OFFSET(s->offset_value, "", "");
+ } else {
+ bb_reg_code_set_offset(dst, 0);
+ }
+ defined = 1;
+ }
+ }
+ if (!defined)
+ bb_reg_set_reg(dst, BBRG_UNDEFINED);
+ else
+ KDB_DEBUG_BB("\n");
+}
+
+/* A generic read from an operand. */
+
+static void
+bb_read_operand(const struct bb_operand *operand)
+{
+ int m = 0;
+ if (operand->base_rc)
+ bb_reg_read(operand->base_rc);
+ if (operand->index_rc)
+ bb_reg_read(operand->index_rc);
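+	/* For a simple memory read through a register that maps the stack,
+	 * record how many words above the original stack pointer the access
+	 * reaches; bb_memory_params keeps the maximum seen.  lea is excluded
+	 * because it only computes an address and never reads the slot.
+	 */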
+ if (bb_is_simple_memory(operand) &&
+ bb_is_osp_defined(operand->base_rc) &&
+ bb_decode.match->usage != BBOU_LEA) {
+ m = (bb_reg_code_offset(operand->base_rc) + operand->disp +
+ KDB_WORD_SIZE - 1) / KDB_WORD_SIZE;
+ bb_memory_params = max(bb_memory_params, m);
+ }
+}
+
+/* A generic write to an operand, resulting in an undefined value in that
+ * location. All well defined operands are handled separately, this function
+ * only handles the opcodes where the result is undefined.
+ */
+
+static void
+bb_write_operand(const struct bb_operand *operand)
+{
+ enum bb_reg_code base_rc = operand->base_rc;
+ if (operand->memory) {
+ if (base_rc)
+ bb_reg_read(base_rc);
+ if (operand->index_rc)
+ bb_reg_read(operand->index_rc);
+ } else if (operand->reg && base_rc) {
+ bb_reg_set_undef(base_rc);
+ }
+ if (bb_is_simple_memory(operand) && bb_is_osp_defined(base_rc)) {
+ int offset;
+ offset = bb_reg_code_offset(base_rc) + operand->disp;
+ offset = ALIGN(offset - KDB_WORD_SIZE + 1, KDB_WORD_SIZE);
+ bb_delete_memory(offset);
+ }
+}
+
+/* Adjust a register that contains a stack pointer */
+
+static void
+bb_adjust_osp(enum bb_reg_code reg, int adjust)
+{
+ int offset = bb_reg_code_offset(reg), old_offset = offset;
+ KDB_DEBUG_BB(" %s osp offset ", bbrg_name[reg]);
+ KDB_DEBUG_BB_OFFSET(bb_reg_code_offset(reg), "", " -> ");
+ offset += adjust;
+ bb_reg_code_set_offset(reg, offset);
+ KDB_DEBUG_BB_OFFSET(bb_reg_code_offset(reg), "", "\n");
+ /* When RSP is adjusted upwards, it invalidates any memory
+ * stored between the old and current stack offsets.
+ */
+ if (reg == BBRG_RSP) {
+ while (old_offset < bb_reg_code_offset(reg)) {
+ bb_delete_memory(old_offset);
+ old_offset += KDB_WORD_SIZE;
+ }
+ }
+}
+
+/* The current instruction adjusts a register that contains a stack pointer.
+ * Direction is 1 or -1, depending on whether the instruction is add/lea or
+ * sub.
+ */
+
+static void
+bb_adjust_osp_instruction(int direction)
+{
+ enum bb_reg_code dst_reg = bb_decode.dst.base_rc;
+ if (bb_decode.src.immediate ||
+ bb_decode.match->usage == BBOU_LEA /* lea has its own checks */) {
+ int adjust = direction * bb_decode.src.disp;
+ bb_adjust_osp(dst_reg, adjust);
+ } else {
+ /* variable stack adjustment, osp offset is not well defined */
+ KDB_DEBUG_BB(" %s osp offset ", bbrg_name[dst_reg]);
+ KDB_DEBUG_BB_OFFSET(bb_reg_code_offset(dst_reg), "", " -> undefined\n");
+ bb_reg_code_set_value(dst_reg, BBRG_UNDEFINED);
+ bb_reg_code_set_offset(dst_reg, 0);
+ }
+}
+
+/* Some instructions using memory have an explicit length suffix (b, w, l, q).
+ * The equivalent instructions using a register imply the length from the
+ * register name. Deduce the operand length.
+ */
+
+static int
+bb_operand_length(const struct bb_operand *operand, char opcode_suffix)
+{
+ int l = 0;
+ switch (opcode_suffix) {
+ case 'b':
+ l = 8;
+ break;
+ case 'w':
+ l = 16;
+ break;
+ case 'l':
+ l = 32;
+ break;
+ case 'q':
+ l = 64;
+ break;
+ }
+ if (l == 0 && operand->reg) {
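+		/* No length suffix, so deduce the width from the register
+		 * name (which includes the leading '%'): three character
+		 * names are 8 bit (%al, %ah) or 16 bit (%ax), four character
+		 * names are 64 bit if they start with 'r' (%rax) else 32 bit
+		 * (%eax).
+		 */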
+ switch (strlen(operand->base)) {
+ case 3:
+ switch (operand->base[2]) {
+ case 'h':
+ case 'l':
+ l = 8;
+ break;
+ default:
+ l = 16;
+ break;
+			}
+			break;
+		case 4:
+ if (operand->base[1] == 'r')
+ l = 64;
+ else
+ l = 32;
+ break;
+ }
+ }
+ return l;
+}
+
+static int
+bb_reg_state_size(const struct bb_reg_state *state)
+{
+ return sizeof(*state) +
+ state->mem_count * sizeof(state->memory[0]);
+}
+
+/* Canonicalize the current bb_reg_state so it can be compared against
+ * previously created states. Sort the memory entries in descending order of
+ * offset_address (stack grows down). Empty slots are moved to the end of the
+ * list and trimmed.
+ */
+
+static void
+bb_reg_state_canonicalize(void)
+{
+ int i, order, changed;
+ struct bb_memory_contains *p1, *p2, temp;
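+	/* Simple exchange sort; the memory array is small.  Defined slots
+	 * sort by descending offset_address, undefined slots sink to the end.
+	 */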
+ do {
+ changed = 0;
+ for (i = 0, p1 = bb_reg_state->memory;
+ i < bb_reg_state->mem_count-1;
+ ++i, ++p1) {
+ p2 = p1 + 1;
+ if (p2->value == BBRG_UNDEFINED) {
+ order = 0;
+ } else if (p1->value == BBRG_UNDEFINED) {
+ order = 1;
+ } else if (p1->offset_address < p2->offset_address) {
+ order = 1;
+ } else if (p1->offset_address > p2->offset_address) {
+ order = -1;
+ } else {
+ order = 0;
+ }
+ if (order > 0) {
+ temp = *p2;
+ *p2 = *p1;
+ *p1 = temp;
+ changed = 1;
+ }
+ }
+	} while (changed);
+ for (i = 0, p1 = bb_reg_state->memory;
+ i < bb_reg_state_max;
+ ++i, ++p1) {
+ if (p1->value != BBRG_UNDEFINED)
+ bb_reg_state->mem_count = i + 1;
+ }
+ bb_reg_state_print(bb_reg_state);
+}
+
+static int
+bb_special_case(bfd_vma to)
+{
+ int i, j, rsp_offset, expect_offset, offset, errors = 0, max_errors = 40;
+ enum bb_reg_code reg, expect_value, value;
+ struct bb_name_state *r;
+
+ for (i = 0, r = bb_special_cases;
+ i < ARRAY_SIZE(bb_special_cases);
+ ++i, ++r) {
+ if (to == r->address &&
+ (r->fname == NULL || strcmp(bb_func_name, r->fname) == 0))
+ goto match;
+ }
+ /* Some inline assembler code has jumps to .fixup sections which result
+ * in out of line transfers with undefined state, ignore them.
+	 * in out of line transfers with undefined state; ignore them.
+ if (strcmp(bb_func_name, "strnlen_user") == 0 ||
+ strcmp(bb_func_name, "copy_from_user") == 0)
+ return 1;
+ return 0;
+
+match:
+ /* Check the running registers match */
+ for (reg = BBRG_RAX; reg < r->regs_size; ++reg) {
+ expect_value = r->regs[reg].value;
+ if (test_bit(expect_value, r->skip_regs.bits)) {
+ /* this regs entry is not defined for this label */
+ continue;
+ }
+ if (expect_value == BBRG_UNDEFINED)
+ continue;
+ expect_offset = r->regs[reg].offset;
+ value = bb_reg_code_value(reg);
+ offset = bb_reg_code_offset(reg);
+ if (expect_value == value &&
+ (value != BBRG_OSP || r->osp_offset == offset))
+ continue;
+ kdb_printf("%s: Expected %s to contain %s",
+ __FUNCTION__,
+ bbrg_name[reg],
+ bbrg_name[expect_value]);
+ if (r->osp_offset)
+ KDB_DEBUG_BB_OFFSET_PRINTF(r->osp_offset, "", "");
+ kdb_printf(". It actually contains %s", bbrg_name[value]);
+ if (offset)
+ KDB_DEBUG_BB_OFFSET_PRINTF(offset, "", "");
+ kdb_printf("\n");
+ ++errors;
+ if (max_errors-- == 0)
+ goto fail;
+ }
+ /* Check that any memory data on stack matches */
+ i = j = 0;
+ while (i < bb_reg_state->mem_count &&
+ j < r->mem_size) {
+ expect_value = r->mem[j].value;
+ if (test_bit(expect_value, r->skip_mem.bits) ||
+ expect_value == BBRG_UNDEFINED) {
+ /* this memory slot is not defined for this label */
+ ++j;
+ continue;
+ }
+ rsp_offset = bb_reg_state->memory[i].offset_address -
+ bb_reg_code_offset(BBRG_RSP);
+ if (rsp_offset >
+ r->mem[j].offset_address) {
+ /* extra slots in memory are OK */
+ ++i;
+ } else if (rsp_offset <
+ r->mem[j].offset_address) {
+ /* Required memory slot is missing */
+ kdb_printf("%s: Invalid bb_reg_state.memory, "
+ "missing memory entry[%d] %s\n",
+ __FUNCTION__, j, bbrg_name[expect_value]);
+ ++errors;
+ if (max_errors-- == 0)
+ goto fail;
+ ++j;
+ } else {
+ if (bb_reg_state->memory[i].offset_value ||
+ bb_reg_state->memory[i].value != expect_value) {
+ /* memory slot is present but contains wrong
+ * value.
+ */
+ kdb_printf("%s: Invalid bb_reg_state.memory, "
+ "wrong value in slot %d, "
+ "should be %s, it is %s\n",
+ __FUNCTION__, i,
+ bbrg_name[expect_value],
+ bbrg_name[bb_reg_state->memory[i].value]);
+ ++errors;
+ if (max_errors-- == 0)
+ goto fail;
+ }
+ ++i;
+ ++j;
+ }
+ }
+ while (j < r->mem_size) {
+ expect_value = r->mem[j].value;
+ if (test_bit(expect_value, r->skip_mem.bits) ||
+ expect_value == BBRG_UNDEFINED)
+ ++j;
+ else
+ break;
+ }
+ if (j != r->mem_size) {
+ /* Hit end of memory before testing all the pt_reg slots */
+ kdb_printf("%s: Invalid bb_reg_state.memory, "
+ "missing trailing entries\n",
+ __FUNCTION__);
+ ++errors;
+ if (max_errors-- == 0)
+ goto fail;
+ }
+ if (errors)
+ goto fail;
+ return 1;
+fail:
+ kdb_printf("%s: on transfer to %s\n", __FUNCTION__, r->name);
+ bb_giveup = 1;
+ return 1;
+}
+
+/* Transfer of control to a label outside the current function. If the
+ * transfer is to a known common code path then do a sanity check on the state
+ * at this point.
+ */
+
+static void
+bb_sanity_check(int type)
+{
+ enum bb_reg_code expect, actual;
+ int i, offset, error = 0;
+
+ for (i = 0; i < ARRAY_SIZE(bb_preserved_reg); ++i) {
+ expect = bb_preserved_reg[i];
+ actual = bb_reg_code_value(expect);
+ offset = bb_reg_code_offset(expect);
+ if (expect == actual)
+ continue;
+ /* type == 1 is sysret/sysexit, ignore RSP */
+ if (type && expect == BBRG_RSP)
+ continue;
+ /* type == 1 is sysret/sysexit, ignore RBP for i386 */
+ /* We used to have "#ifndef CONFIG_X86_64" for the type=1 RBP
+ * test; however, x86_64 can run ia32 compatible mode and
+ * hit this problem. Perform the following test anyway!
+ */
+ if (type && expect == BBRG_RBP)
+ continue;
+ /* RSP should contain OSP+0. Except for ptregscall_common and
+ * ia32_ptregs_common, they get a partial pt_regs, fudge the
+ * stack to make it a full pt_regs then reverse the effect on
+ * exit, so the offset is -0x50 on exit.
+ */
+ if (expect == BBRG_RSP &&
+ bb_is_osp_defined(expect) &&
+ (offset == 0 ||
+ (offset == -0x50 &&
+ (strcmp(bb_func_name, "ptregscall_common") == 0 ||
+ strcmp(bb_func_name, "ia32_ptregs_common") == 0))))
+ continue;
+ /* The put_user and save_paranoid functions are special.
+ * %rbx gets clobbered */
+ if (expect == BBRG_RBX &&
+ (strncmp(bb_func_name, "__put_user_", 11) == 0 ||
+ strcmp(bb_func_name, "save_paranoid") == 0))
+ continue;
+ /* Ignore rbp and rsp for error_entry */
+ if ((strcmp(bb_func_name, "error_entry") == 0) &&
+ (expect == BBRG_RBX ||
+ (expect == BBRG_RSP && bb_is_osp_defined(expect) && offset == -0x10)))
+ continue;
+ kdb_printf("%s: Expected %s, got %s",
+ __FUNCTION__,
+ bbrg_name[expect], bbrg_name[actual]);
+ if (offset)
+ KDB_DEBUG_BB_OFFSET_PRINTF(offset, "", "");
+ kdb_printf("\n");
+ error = 1;
+ }
+ BB_CHECK(error, error, );
+}
+
+/* Transfer of control. Follow the arc and save the current state as input to
+ * another basic block.
+ */
+
+static void
+bb_transfer(bfd_vma from, bfd_vma to, unsigned int drop_through)
+{
+ int i, found;
+ size_t size;
+	struct bb *bb = NULL; /* stupid gcc */
+ struct bb_jmp *bb_jmp;
+ struct bb_reg_state *state;
+ bb_reg_state_canonicalize();
+ found = 0;
+ for (i = 0; i < bb_jmp_count; ++i) {
+ bb_jmp = bb_jmp_list + i;
+ if (bb_jmp->from == from &&
+ bb_jmp->to == to &&
+ bb_jmp->drop_through == drop_through) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ /* Transfer outside the current function. Check the special
+ * cases (mainly in entry.S) first. If it is not a known
+ * special case then check if the target address is the start
+ * of a function or not. If it is the start of a function then
+ * assume tail recursion and require that the state be the same
+ * as on entry. Otherwise assume out of line code (e.g.
+ * spinlock contention path) and ignore it, the state can be
+ * anything.
+ */
+ kdb_symtab_t symtab;
+ if (bb_special_case(to))
+ return;
+ kdbnearsym(to, &symtab);
+ if (symtab.sym_start != to)
+ return;
+ bb_sanity_check(0);
+ if (bb_giveup)
+ return;
+#ifdef NO_SIBLINGS
+ /* Only print this message when the kernel is compiled with
+ * -fno-optimize-sibling-calls. Otherwise it would print a
+ * message for every tail recursion call. If you see the
+ * message below then you probably have an assembler label that
+ * is not listed in the special cases.
+ */
+ kdb_printf(" not matched: from "
+ kdb_bfd_vma_fmt0
+ " to " kdb_bfd_vma_fmt0
+ " drop_through %d bb_jmp[%d]\n",
+ from, to, drop_through, i);
+#endif /* NO_SIBLINGS */
+ return;
+ }
+ KDB_DEBUG_BB(" matched: from " kdb_bfd_vma_fmt0
+ " to " kdb_bfd_vma_fmt0
+ " drop_through %d bb_jmp[%d]\n",
+ from, to, drop_through, i);
+ found = 0;
+ for (i = 0; i < bb_count; ++i) {
+ bb = bb_list[i];
+ if (bb->start == to) {
+ found = 1;
+ break;
+ }
+ }
+ BB_CHECK(!found, to, );
+ /* If the register state for this arc has already been set (we are
+ * rescanning the block that originates the arc) and the state is the
+ * same as the previous state for this arc then this input to the
+ * target block is the same as last time, so there is no need to rescan
+ * the target block.
+ */
+ state = bb_jmp->state;
+ size = bb_reg_state_size(bb_reg_state);
+ if (state) {
+ bb_reg_state->ref_count = state->ref_count;
+ if (memcmp(state, bb_reg_state, size) == 0) {
+ KDB_DEBUG_BB(" no state change\n");
+ return;
+ }
+ if (--state->ref_count == 0)
+ debug_kfree(state);
+ bb_jmp->state = NULL;
+ }
+ /* New input state is required. To save space, check if any other arcs
+ * have the same state and reuse them where possible. The overall set
+ * of inputs to the target block is now different so the target block
+ * must be rescanned.
+ */
+ bb->changed = 1;
+ for (i = 0; i < bb_jmp_count; ++i) {
+ state = bb_jmp_list[i].state;
+ if (!state)
+ continue;
+ bb_reg_state->ref_count = state->ref_count;
+ if (memcmp(state, bb_reg_state, size) == 0) {
+ KDB_DEBUG_BB(" reuse bb_jmp[%d]\n", i);
+ bb_jmp->state = state;
+ ++state->ref_count;
+ return;
+ }
+ }
+ state = debug_kmalloc(size, GFP_ATOMIC);
+ if (!state) {
+ kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
+ bb_giveup = 1;
+ return;
+ }
+ memcpy(state, bb_reg_state, size);
+ state->ref_count = 1;
+ bb_jmp->state = state;
+ KDB_DEBUG_BB(" new state %p\n", state);
+}
+
+/* Isolate the processing for 'mov' so it can be used for 'xadd'/'xchg' as
+ * well.
+ *
+ * xadd/xchg expect this function to return BBOU_NOP for special cases,
+ * otherwise it returns BBOU_RSWD. All special cases must be handled entirely
+ * within this function, including doing bb_read_operand or bb_write_operand
+ * where necessary.
+ */
+
+static enum bb_operand_usage
+bb_usage_mov(const struct bb_operand *src, const struct bb_operand *dst, int l)
+{
+ int full_register_src, full_register_dst;
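+	/* 'l' is the length of the base mnemonic (e.g. 4 for "xadd"), so
+	 * bb_decode.opcode[l] is the size suffix, if any (e.g. the 'q' in
+	 * "xaddq").
+	 */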
+ full_register_src = bb_operand_length(src, bb_decode.opcode[l])
+ == KDB_WORD_SIZE * 8;
+ full_register_dst = bb_operand_length(dst, bb_decode.opcode[l])
+ == KDB_WORD_SIZE * 8;
+ /* If both src and dst are full integer registers then record the
+ * register change.
+ */
+ if (src->reg &&
+ bb_is_int_reg(src->base_rc) &&
+ dst->reg &&
+ bb_is_int_reg(dst->base_rc) &&
+ full_register_src &&
+ full_register_dst) {
+ /* Special case for the code that switches stacks in
+ * jprobe_return. That code must modify RSP but it does it in
+ * a well defined manner. Do not invalidate RSP.
+ */
+ if (src->base_rc == BBRG_RBX &&
+ dst->base_rc == BBRG_RSP &&
+ strcmp(bb_func_name, "jprobe_return") == 0) {
+ bb_read_operand(src);
+ return BBOU_NOP;
+ }
+ /* math_abort takes the equivalent of a longjmp structure and
+		 * resets the stack. Ignore this; it leaves RSP well defined.
+ */
+ if (dst->base_rc == BBRG_RSP &&
+ strcmp(bb_func_name, "math_abort") == 0) {
+ bb_read_operand(src);
+ return BBOU_NOP;
+ }
+ bb_reg_set_reg(dst->base_rc, src->base_rc);
+ return BBOU_NOP;
+ }
+ /* If the move is from a full integer register to stack then record it.
+ */
+ if (src->reg &&
+ bb_is_simple_memory(dst) &&
+ bb_is_osp_defined(dst->base_rc) &&
+ full_register_src) {
+ /* Ugly special case. Initializing list heads on stack causes
+ * false references to stack variables when the list head is
+ * used. Static code analysis cannot detect that the list head
+ * has been changed by a previous execution loop and that a
+ * basic block is only executed after the list head has been
+ * changed.
+ *
+ * These false references can result in valid stack variables
+ * being incorrectly cleared on some logic paths. Ignore
+ * stores to stack variables which point to themselves or to
+ * the previous word so the list head initialization is not
+ * recorded.
+ */
+ if (bb_is_osp_defined(src->base_rc)) {
+ int stack1 = bb_reg_code_offset(src->base_rc);
+ int stack2 = bb_reg_code_offset(dst->base_rc) +
+ dst->disp;
+ if (stack1 == stack2 ||
+ stack1 == stack2 - KDB_WORD_SIZE)
+ return BBOU_NOP;
+ }
+ bb_memory_set_reg(dst->base_rc, src->base_rc, dst->disp);
+ return BBOU_NOP;
+ }
+ /* If the move is from stack to a full integer register then record it.
+ */
+ if (bb_is_simple_memory(src) &&
+ bb_is_osp_defined(src->base_rc) &&
+ dst->reg &&
+ bb_is_int_reg(dst->base_rc) &&
+ full_register_dst) {
+#ifdef CONFIG_X86_32
- #ifndef TSS_sysenter_sp0
- #define TSS_sysenter_sp0 SYSENTER_stack_sp0
- #endif
+ /* mov from TSS_sysenter_sp0+offset to esp to fix up the
+	 * sysenter stack; it leaves esp well defined. mov
+	 * TSS_sysenter_sp0+offset(%esp),%esp is followed by up to 5
+ * push instructions to mimic the hardware stack push. If
+	 * the load is from TSS_sysenter_sp0 plus an extra offset then
+	 * only 3 words will be pushed.
+ */
+ if (dst->base_rc == BBRG_RSP &&
+ src->disp >= TSS_sysenter_sp0 &&
+ bb_is_osp_defined(BBRG_RSP)) {
+ int pushes;
+ pushes = src->disp == TSS_sysenter_sp0 ? 5 : 3;
+ bb_reg_code_set_offset(BBRG_RSP,
+ bb_reg_code_offset(BBRG_RSP) +
+ pushes * KDB_WORD_SIZE);
+ KDB_DEBUG_BB_OFFSET(
+ bb_reg_code_offset(BBRG_RSP),
+ " sysenter fixup, RSP",
+ "\n");
+ return BBOU_NOP;
+ }
+#endif /* CONFIG_X86_32 */
+ bb_read_operand(src);
+ bb_reg_set_memory(dst->base_rc, src->base_rc, src->disp);
+ return BBOU_NOP;
+ }
+ /* move %gs:0x<nn>,%rsp is used to unconditionally switch to another
+	 * stack. Ignore this special case; it is handled by the stack
+ * unwinding code.
+ */
+ if (src->segment &&
+ strcmp(src->segment, "%gs") == 0 &&
+ dst->reg &&
+ dst->base_rc == BBRG_RSP)
+ return BBOU_NOP;
+ /* move %reg,%reg is a nop */
+ if (src->reg &&
+ dst->reg &&
+ !src->segment &&
+ !dst->segment &&
+ strcmp(src->base, dst->base) == 0)
+ return BBOU_NOP;
+ /* Special case for the code that switches stacks in the scheduler
+ * (switch_to()). That code must modify RSP but it does it in a well
+ * defined manner. Do not invalidate RSP.
+ */
+ if (dst->reg &&
+ dst->base_rc == BBRG_RSP &&
+ full_register_dst &&
+ bb_is_scheduler_address()) {
+ bb_read_operand(src);
+ return BBOU_NOP;
+ }
+ /* Special case for the code that switches stacks in resume from
+ * hibernation code. That code must modify RSP but it does it in a
+ * well defined manner. Do not invalidate RSP.
+ */
+ if (src->memory &&
+ dst->reg &&
+ dst->base_rc == BBRG_RSP &&
+ full_register_dst &&
+ strcmp(bb_func_name, "restore_image") == 0) {
+ bb_read_operand(src);
+ return BBOU_NOP;
+ }
+ return BBOU_RSWD;
+}
+
+static enum bb_operand_usage
+bb_usage_xadd(const struct bb_operand *src, const struct bb_operand *dst)
+{
+ /* Simulate xadd as a series of instructions including mov, that way we
+ * get the benefit of all the special cases already handled by
+ * BBOU_MOV.
+ *
+ * tmp = src + dst, src = dst, dst = tmp.
+ *
+ * For tmp, pick a register that is undefined. If all registers are
+ * defined then pick one that is not being used by xadd.
+ */
+ enum bb_reg_code reg = BBRG_UNDEFINED;
+ struct bb_operand tmp;
+ struct bb_reg_contains save_tmp;
+ enum bb_operand_usage usage;
+ int undefined = 0;
+ for (reg = BBRG_RAX; reg < BBRG_RAX + KDB_INT_REGISTERS; ++reg) {
+ if (bb_reg_code_value(reg) == BBRG_UNDEFINED) {
+ undefined = 1;
+ break;
+ }
+ }
+ if (!undefined) {
+ for (reg = BBRG_RAX; reg < BBRG_RAX + KDB_INT_REGISTERS; ++reg) {
+ if (reg != src->base_rc &&
+ reg != src->index_rc &&
+ reg != dst->base_rc &&
+ reg != dst->index_rc &&
+ reg != BBRG_RSP)
+ break;
+ }
+ }
+ KDB_DEBUG_BB(" %s saving tmp %s\n", __FUNCTION__, bbrg_name[reg]);
+ save_tmp = bb_reg_state->contains[reg - BBRG_RAX];
+ bb_reg_set_undef(reg);
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.present = 1;
+ tmp.reg = 1;
+ tmp.base = debug_kmalloc(strlen(bbrg_name[reg]) + 2, GFP_ATOMIC);
+ if (tmp.base) {
+ tmp.base[0] = '%';
+ strcpy(tmp.base + 1, bbrg_name[reg]);
+ }
+ tmp.base_rc = reg;
+ bb_read_operand(src);
+ bb_read_operand(dst);
+ if (bb_usage_mov(src, dst, sizeof("xadd")-1) == BBOU_NOP)
+ usage = BBOU_RSRD;
+ else
+ usage = BBOU_RSRDWS;
+ bb_usage_mov(&tmp, dst, sizeof("xadd")-1);
+ KDB_DEBUG_BB(" %s restoring tmp %s\n", __FUNCTION__, bbrg_name[reg]);
+ bb_reg_state->contains[reg - BBRG_RAX] = save_tmp;
+ debug_kfree(tmp.base);
+ return usage;
+}
+
+static enum bb_operand_usage
+bb_usage_xchg(const struct bb_operand *src, const struct bb_operand *dst)
+{
+ /* Simulate xchg as a series of mov instructions, that way we get the
+ * benefit of all the special cases already handled by BBOU_MOV.
+ *
+ * mov dst,tmp; mov src,dst; mov tmp,src;
+ *
+ * For tmp, pick a register that is undefined. If all registers are
+ * defined then pick one that is not being used by xchg.
+ */
+ enum bb_reg_code reg = BBRG_UNDEFINED;
+ int rs = BBOU_RS, rd = BBOU_RD, ws = BBOU_WS, wd = BBOU_WD;
+ struct bb_operand tmp;
+ struct bb_reg_contains save_tmp;
+ int undefined = 0;
+ for (reg = BBRG_RAX; reg < BBRG_RAX + KDB_INT_REGISTERS; ++reg) {
+ if (bb_reg_code_value(reg) == BBRG_UNDEFINED) {
+ undefined = 1;
+ break;
+ }
+ }
+ if (!undefined) {
+ for (reg = BBRG_RAX; reg < BBRG_RAX + KDB_INT_REGISTERS; ++reg) {
+ if (reg != src->base_rc &&
+ reg != src->index_rc &&
+ reg != dst->base_rc &&
+ reg != dst->index_rc &&
+ reg != BBRG_RSP)
+ break;
+ }
+ }
+ KDB_DEBUG_BB(" %s saving tmp %s\n", __FUNCTION__, bbrg_name[reg]);
+ save_tmp = bb_reg_state->contains[reg - BBRG_RAX];
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.present = 1;
+ tmp.reg = 1;
+ tmp.base = debug_kmalloc(strlen(bbrg_name[reg]) + 2, GFP_ATOMIC);
+ if (tmp.base) {
+ tmp.base[0] = '%';
+ strcpy(tmp.base + 1, bbrg_name[reg]);
+ }
+ tmp.base_rc = reg;
+ if (bb_usage_mov(dst, &tmp, sizeof("xchg")-1) == BBOU_NOP)
+ rd = 0;
+ if (bb_usage_mov(src, dst, sizeof("xchg")-1) == BBOU_NOP) {
+ rs = 0;
+ wd = 0;
+ }
+ if (bb_usage_mov(&tmp, src, sizeof("xchg")-1) == BBOU_NOP)
+ ws = 0;
+ KDB_DEBUG_BB(" %s restoring tmp %s\n", __FUNCTION__, bbrg_name[reg]);
+ bb_reg_state->contains[reg - BBRG_RAX] = save_tmp;
+ debug_kfree(tmp.base);
+ return rs | rd | ws | wd;
+}
+
+/* Invalidate all the scratch registers */
+
+static void
+bb_invalidate_scratch_reg(void)
+{
+ int i, j;
+ for (i = BBRG_RAX; i < BBRG_RAX + KDB_INT_REGISTERS; ++i) {
+ for (j = 0; j < ARRAY_SIZE(bb_preserved_reg); ++j) {
+ if (i == bb_preserved_reg[j])
+ goto preserved;
+ }
+ bb_reg_set_undef(i);
+preserved:
+ continue;
+ }
+}
+
+static void
+bb_pass2_computed_jmp(const struct bb_operand *src)
+{
+ unsigned long table = src->disp;
+ kdb_machreg_t addr;
+ while (!bb_giveup) {
+ if (kdb_getword(&addr, table, sizeof(addr)))
+ return;
+ if (addr < bb_func_start || addr >= bb_func_end)
+ return;
+ bb_transfer(bb_curr_addr, addr, 0);
+ table += KDB_WORD_SIZE;
+ }
+}
+
+/* The current instruction has been decoded and all the information is in
+ * bb_decode. Based on the opcode, track any operand usage that we care about.
+ */
+
+static void
+bb_usage(void)
+{
+ enum bb_operand_usage usage = bb_decode.match->usage;
+ struct bb_operand *src = &bb_decode.src;
+ struct bb_operand *dst = &bb_decode.dst;
+ struct bb_operand *dst2 = &bb_decode.dst2;
+ int opcode_suffix, operand_length;
+
+ /* First handle all the special usage cases, and map them to a generic
+ * case after catering for the side effects.
+ */
+
+ if (usage == BBOU_IMUL &&
+ src->present && !dst->present && !dst2->present) {
+ /* single operand imul, same effects as mul */
+ usage = BBOU_MUL;
+ }
+
+ /* AT&T syntax uses movs<l1><l2> for move with sign extension, instead
+ * of the Intel movsx. The AT&T syntax causes problems for the opcode
+ * mapping; movs with sign extension needs to be treated as a generic
+ * read src, write dst, but instead it falls under the movs I/O
+ * instruction. Fix it.
+ */
+ if (usage == BBOU_MOVS && strlen(bb_decode.opcode) > 5)
+ usage = BBOU_RSWD;
+
+ /* This switch statement deliberately does not use 'default' at the top
+ * level. That way the compiler will complain if a new BBOU_ enum is
+ * added above and not explicitly handled here.
+ */
+ switch (usage) {
+ case BBOU_UNKNOWN: /* drop through */
+ case BBOU_RS: /* drop through */
+ case BBOU_RD: /* drop through */
+ case BBOU_RSRD: /* drop through */
+ case BBOU_WS: /* drop through */
+ case BBOU_RSWS: /* drop through */
+ case BBOU_RDWS: /* drop through */
+ case BBOU_RSRDWS: /* drop through */
+ case BBOU_WD: /* drop through */
+ case BBOU_RSWD: /* drop through */
+ case BBOU_RDWD: /* drop through */
+ case BBOU_RSRDWD: /* drop through */
+ case BBOU_WSWD: /* drop through */
+ case BBOU_RSWSWD: /* drop through */
+ case BBOU_RDWSWD: /* drop through */
+ case BBOU_RSRDWSWD:
+ break; /* ignore generic usage for now */
+ case BBOU_ADD:
+ /* Special case for add instructions that adjust registers
+ * which are mapping the stack.
+ */
+ if (dst->reg && bb_is_osp_defined(dst->base_rc)) {
+ bb_adjust_osp_instruction(1);
+ usage = BBOU_RS;
+ } else {
+ usage = BBOU_RSRDWD;
+ }
+ break;
+ case BBOU_AND:
+ /* Special case when trying to round the stack pointer
+ * to achieve byte alignment
+ */
+ if (dst->reg && dst->base_rc == BBRG_RSP &&
+ src->immediate && strncmp(bb_func_name, "efi_call", 8) == 0) {
+ usage = BBOU_NOP;
+ } else {
+ usage = BBOU_RSRDWD;
+ }
+ break;
+ case BBOU_CALL:
+ bb_reg_state_print(bb_reg_state);
+ usage = BBOU_NOP;
+ if (bb_is_static_disp(src)) {
+ /* save_args is special. It saves
+ * a partial pt_regs onto the stack and switches
+ * to the interrupt stack.
+ */
+ if (src->disp == bb_save_args) {
+ bb_memory_set_reg(BBRG_RSP, BBRG_RDI, 0x48);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RSI, 0x40);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RDX, 0x38);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RCX, 0x30);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RAX, 0x28);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R8, 0x20);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R9, 0x18);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R10, 0x10);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R11, 0x08);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RBP, 0);
+ /* This is actually on the interrupt stack,
+ * but we fudge it so the unwind works.
+ */
+ bb_memory_set_reg_value(BBRG_RSP, -0x8, BBRG_RBP, 0);
+ bb_reg_set_reg(BBRG_RBP, BBRG_RSP);
+ bb_adjust_osp(BBRG_RSP, -KDB_WORD_SIZE);
+ }
+ /* save_rest juggles the stack frame to append the
+ * rest of the pt_regs onto a stack where SAVE_ARGS
+ * or save_args has already been done.
+ */
+ else if (src->disp == bb_save_rest) {
+ bb_memory_set_reg(BBRG_RSP, BBRG_RBX, 0x30);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RBP, 0x28);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R12, 0x20);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R13, 0x18);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R14, 0x10);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R15, 0x08);
+ }
+ /* error_entry and save_paranoid save a full pt_regs.
+ * Break out so the scratch registers aren't invalidated.
+ */
+ else if (src->disp == bb_error_entry || src->disp == bb_save_paranoid) {
+ bb_memory_set_reg(BBRG_RSP, BBRG_RDI, 0x70);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RSI, 0x68);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RDX, 0x60);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RCX, 0x58);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RAX, 0x50);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R8, 0x48);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R9, 0x40);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R10, 0x38);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R11, 0x30);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RBX, 0x28);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RBP, 0x20);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R12, 0x18);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R13, 0x10);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R14, 0x08);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R15, 0);
+ break;
+ }
+ }
+ /* Invalidate the scratch registers */
+ bb_invalidate_scratch_reg();
+
+ /* These special cases need scratch registers invalidated first */
+ if (bb_is_static_disp(src)) {
+ /* Function sync_regs and save_v86_state are special.
+ * Their return value is the new stack pointer
+ */
+ if (src->disp == bb_sync_regs) {
+ bb_reg_set_reg(BBRG_RAX, BBRG_RSP);
+ } else if (src->disp == bb_save_v86_state) {
+ bb_reg_set_reg(BBRG_RAX, BBRG_RSP);
+ bb_adjust_osp(BBRG_RAX, +KDB_WORD_SIZE);
+ }
+ }
+ break;
+ case BBOU_CBW:
+ /* Convert word in RAX. Read RAX, write RAX */
+ bb_reg_read(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RAX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_CMOV:
+ /* cmove %gs:0x<nn>,%rsp is used to conditionally switch to
+ * another stack. Ignore this special case, it is handled by
+ * the stack unwinding code.
+ */
+ if (src->segment &&
+ strcmp(src->segment, "%gs") == 0 &&
+ dst->reg &&
+ dst->base_rc == BBRG_RSP)
+ usage = BBOU_NOP;
+ else
+ usage = BBOU_RSWD;
+ break;
+ case BBOU_CMPXCHG:
+ /* Read RAX, write RAX plus src read, dst write */
+ bb_reg_read(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RAX);
+ usage = BBOU_RSWD;
+ break;
+ case BBOU_CMPXCHGD:
+ /* Read RAX, RBX, RCX, RDX, write RAX, RDX plus src read/write */
+ bb_reg_read(BBRG_RAX);
+ bb_reg_read(BBRG_RBX);
+ bb_reg_read(BBRG_RCX);
+ bb_reg_read(BBRG_RDX);
+ bb_reg_set_undef(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RDX);
+ usage = BBOU_RSWS;
+ break;
+ case BBOU_CPUID:
+ /* Read RAX, write RAX, RBX, RCX, RDX */
+ bb_reg_read(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RBX);
+ bb_reg_set_undef(BBRG_RCX);
+ bb_reg_set_undef(BBRG_RDX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_CWD:
+ /* Convert word in RAX, RDX. Read RAX, write RDX */
+ bb_reg_read(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RDX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_DIV: /* drop through */
+ case BBOU_IDIV:
+ /* The 8 bit variants only affect RAX, the 16, 32 and 64 bit
+ * variants affect RDX as well.
+ */
+ switch (usage) {
+ case BBOU_DIV:
+ opcode_suffix = bb_decode.opcode[3];
+ break;
+ case BBOU_IDIV:
+ opcode_suffix = bb_decode.opcode[4];
+ break;
+ default:
+ opcode_suffix = 'q';
+ break;
+ }
+ operand_length = bb_operand_length(src, opcode_suffix);
+ bb_reg_read(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RAX);
+ if (operand_length != 8) {
+ bb_reg_read(BBRG_RDX);
+ bb_reg_set_undef(BBRG_RDX);
+ }
+ usage = BBOU_RS;
+ break;
+ case BBOU_IMUL:
+ /* Only the two and three operand forms get here. The one
+ * operand form is treated as mul.
+ */
+ if (dst2->present) {
+ /* The three operand form is a special case, read the first two
+ * operands, write the third.
+ */
+ bb_read_operand(src);
+ bb_read_operand(dst);
+ bb_write_operand(dst2);
+ usage = BBOU_NOP;
+ } else {
+ usage = BBOU_RSRDWD;
+ }
+ break;
+ case BBOU_IRET:
+ bb_sanity_check(0);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_JMP:
+ if (bb_is_static_disp(src))
+ bb_transfer(bb_curr_addr, src->disp, 0);
+ else if (src->indirect &&
+ src->disp &&
+ src->base == NULL &&
+ src->index &&
+ src->scale == KDB_WORD_SIZE)
+ bb_pass2_computed_jmp(src);
+ usage = BBOU_RS;
+ break;
+ case BBOU_LAHF:
+ /* Write RAX */
+ bb_reg_set_undef(BBRG_RAX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_LEA:
+ /* dst = src + disp. Often used to calculate offsets into the
+ * stack, so check if it uses a stack pointer.
+ */
+ usage = BBOU_RSWD;
+ if (bb_is_simple_memory(src)) {
+ if (bb_is_osp_defined(src->base_rc)) {
+ bb_reg_set_reg(dst->base_rc, src->base_rc);
+ bb_adjust_osp_instruction(1);
+ usage = BBOU_RS;
+ } else if (src->disp == 0 &&
+ src->base_rc == dst->base_rc) {
+ /* lea 0(%reg),%reg is generated by i386
+ * GENERIC_NOP7.
+ */
+ usage = BBOU_NOP;
+ } else if (src->disp == 4096 &&
+ (src->base_rc == BBRG_R8 ||
+ src->base_rc == BBRG_RDI) &&
+ strcmp(bb_func_name, "relocate_kernel") == 0) {
+ /* relocate_kernel: setup a new stack at the
+ * end of the physical control page, using
+ * (x86_64) lea 4096(%r8),%rsp or (i386) lea
+ * 4096(%edi),%esp
+ */
+ usage = BBOU_NOP;
+ }
+ }
+ break;
+ case BBOU_LEAVE:
+ /* RSP = RBP; RBP = *(RSP); RSP += KDB_WORD_SIZE; */
+ bb_reg_set_reg(BBRG_RSP, BBRG_RBP);
+ if (bb_is_osp_defined(BBRG_RSP))
+ bb_reg_set_memory(BBRG_RBP, BBRG_RSP, 0);
+ else
+ bb_reg_set_undef(BBRG_RBP);
+ if (bb_is_osp_defined(BBRG_RSP))
+ bb_adjust_osp(BBRG_RSP, KDB_WORD_SIZE);
+ /* common_interrupt uses leave in a non-standard manner */
+ if (strcmp(bb_func_name, "common_interrupt") != 0)
+ bb_sanity_check(0);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_LODS:
+ /* Read RSI, write RAX, RSI */
+ bb_reg_read(BBRG_RSI);
+ bb_reg_set_undef(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RSI);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_LOOP:
+ /* Read and write RCX */
+ bb_reg_read(BBRG_RCX);
+ bb_reg_set_undef(BBRG_RCX);
+ if (bb_is_static_disp(src))
+ bb_transfer(bb_curr_addr, src->disp, 0);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_LSS:
+ /* lss offset(%esp),%esp leaves esp well defined */
+ if (dst->reg &&
+ dst->base_rc == BBRG_RSP &&
+ bb_is_simple_memory(src) &&
+ src->base_rc == BBRG_RSP) {
+ bb_adjust_osp(BBRG_RSP, 2*KDB_WORD_SIZE + src->disp);
+ usage = BBOU_NOP;
+ } else {
+ usage = BBOU_RSWD;
+ }
+ break;
+ case BBOU_MONITOR:
+ /* Read RAX, RCX, RDX */
+ bb_reg_set_undef(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RCX);
+ bb_reg_set_undef(BBRG_RDX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_MOV:
+ usage = bb_usage_mov(src, dst, sizeof("mov")-1);
+ break;
+ case BBOU_MOVS:
+ /* Read RSI, RDI, write RSI, RDI */
+ bb_reg_read(BBRG_RSI);
+ bb_reg_read(BBRG_RDI);
+ bb_reg_set_undef(BBRG_RSI);
+ bb_reg_set_undef(BBRG_RDI);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_MUL:
+ /* imul (one operand form only) or mul. Read RAX. If the
+ * operand length is not 8 then write RDX.
+ */
+ if (bb_decode.opcode[0] == 'i')
+ opcode_suffix = bb_decode.opcode[4];
+ else
+ opcode_suffix = bb_decode.opcode[3];
+ operand_length = bb_operand_length(src, opcode_suffix);
+ bb_reg_read(BBRG_RAX);
+ if (operand_length != 8)
+ bb_reg_set_undef(BBRG_RDX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_MWAIT:
+ /* Read RAX, RCX */
+ bb_reg_read(BBRG_RAX);
+ bb_reg_read(BBRG_RCX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_NOP:
+ break;
+ case BBOU_OUTS:
+ /* Read RSI, RDX, write RSI */
+ bb_reg_read(BBRG_RSI);
+ bb_reg_read(BBRG_RDX);
+ bb_reg_set_undef(BBRG_RSI);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_POP:
+ /* Complicated by the fact that you can pop from the top of the stack
+ * to a stack location; in that case the destination location
+ * is calculated after adjusting RSP. Analysis of the kernel
+ * code shows that gcc only uses this strange format to get the
+ * flags into a local variable, e.g. pushf; popl 0x10(%esp); so
+ * I am going to ignore this special case.
+ */
+ usage = BBOU_WS;
+ if (!bb_is_osp_defined(BBRG_RSP)) {
+ if (!bb_is_scheduler_address()) {
+ kdb_printf("pop when BBRG_RSP is undefined?\n");
+ bb_giveup = 1;
+ }
+ } else {
+ if (src->reg) {
+ bb_reg_set_memory(src->base_rc, BBRG_RSP, 0);
+ usage = BBOU_NOP;
+ }
+ /* pop %rsp does not adjust rsp */
+ if (!src->reg ||
+ src->base_rc != BBRG_RSP)
+ bb_adjust_osp(BBRG_RSP, KDB_WORD_SIZE);
+ }
+ break;
+ case BBOU_POPF:
+ /* Do not care about flags, just adjust RSP */
+ if (!bb_is_osp_defined(BBRG_RSP)) {
+ if (!bb_is_scheduler_address()) {
+ kdb_printf("popf when BBRG_RSP is undefined?\n");
+ bb_giveup = 1;
+ }
+ } else {
+ bb_adjust_osp(BBRG_RSP, KDB_WORD_SIZE);
+ }
+ usage = BBOU_WS;
+ break;
+ case BBOU_PUSH:
+ /* Complicated by the fact that you can push from a stack
+ * location to the top of the stack; in that case the source location
+ * is calculated before adjusting RSP. Analysis of the kernel code shows
+ * that gcc only uses this strange format to restore the flags
+ * from a local variable, e.g. pushl 0x10(%esp); popf; so I am
+ * going to ignore this special case.
+ */
+ usage = BBOU_RS;
+ if (!bb_is_osp_defined(BBRG_RSP)) {
+ if (!bb_is_scheduler_address()) {
+ kdb_printf("push when BBRG_RSP is undefined?\n");
+ bb_giveup = 1;
+ }
+ } else {
+ bb_adjust_osp(BBRG_RSP, -KDB_WORD_SIZE);
+ if (src->reg &&
+ bb_reg_code_offset(BBRG_RSP) <= 0)
+ bb_memory_set_reg(BBRG_RSP, src->base_rc, 0);
+ }
+ break;
+ case BBOU_PUSHF:
+ /* Do not care about flags, just adjust RSP */
+ if (!bb_is_osp_defined(BBRG_RSP)) {
+ if (!bb_is_scheduler_address()) {
+ kdb_printf("pushf when BBRG_RSP is undefined?\n");
+ bb_giveup = 1;
+ }
+ } else {
+ bb_adjust_osp(BBRG_RSP, -KDB_WORD_SIZE);
+ }
+ usage = BBOU_WS;
+ break;
+ case BBOU_RDMSR:
+ /* Read RCX, write RAX, RDX */
+ bb_reg_read(BBRG_RCX);
+ bb_reg_set_undef(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RDX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_RDTSC:
+ /* Write RAX, RDX */
+ bb_reg_set_undef(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RDX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_RET:
+ usage = BBOU_NOP;
+ if (src->immediate && bb_is_osp_defined(BBRG_RSP)) {
+ bb_adjust_osp(BBRG_RSP, src->disp);
+ }
+ /* Functions that restore state which was saved by another
+ * function or build new kernel stacks. We cannot verify what
+ * is being restored so skip the sanity check.
+ */
+ if (strcmp(bb_func_name, "restore_image") == 0 ||
+ strcmp(bb_func_name, "relocate_kernel") == 0 ||
+ strcmp(bb_func_name, "identity_mapped") == 0 ||
+ strcmp(bb_func_name, "xen_iret_crit_fixup") == 0 ||
+ strcmp(bb_func_name, "math_abort") == 0 ||
+ strcmp(bb_func_name, "save_args") == 0 ||
+ strcmp(bb_func_name, "kretprobe_trampoline_holder") == 0)
+ break;
+ bb_sanity_check(0);
+ break;
+ case BBOU_SAHF:
+ /* Read RAX */
+ bb_reg_read(BBRG_RAX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_SCAS:
+ /* Read RAX, RDI, write RDI */
+ bb_reg_read(BBRG_RAX);
+ bb_reg_read(BBRG_RDI);
+ bb_reg_set_undef(BBRG_RDI);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_SUB:
+ /* Special case for sub instructions that adjust registers
+ * which are mapping the stack.
+ */
+ if (dst->reg && bb_is_osp_defined(dst->base_rc)) {
+ bb_adjust_osp_instruction(-1);
+ usage = BBOU_RS;
+ } else {
+ usage = BBOU_RSRDWD;
+ }
+ break;
+ case BBOU_SYSEXIT:
+ bb_sanity_check(1);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_SYSRET:
+ bb_sanity_check(1);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_WRMSR:
+ /* Read RCX, RAX, RDX */
+ bb_reg_read(BBRG_RCX);
+ bb_reg_read(BBRG_RAX);
+ bb_reg_read(BBRG_RDX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_XADD:
+ usage = bb_usage_xadd(src, dst);
+ break;
+ case BBOU_XCHG:
+ /* i386 do_IRQ with 4K stacks does xchg %ebx,%esp; call
+ * irq_handler; mov %ebx,%esp; to switch stacks. Ignore this
+ * stack switch when tracking registers, it is handled by
+ * higher level backtrace code. Convert xchg %ebx,%esp to mov
+ * %esp,%ebx so the later mov %ebx,%esp becomes a NOP and the
+ * stack remains defined so we can backtrace through do_IRQ's
+ * stack switch.
+ *
+ * Ditto for do_softirq.
+ */
+ if (src->reg &&
+ dst->reg &&
+ src->base_rc == BBRG_RBX &&
+ dst->base_rc == BBRG_RSP &&
+ (strcmp(bb_func_name, "do_IRQ") == 0 ||
+ strcmp(bb_func_name, "do_softirq") == 0)) {
+ strcpy(bb_decode.opcode, "mov");
+ usage = bb_usage_mov(dst, src, sizeof("mov")-1);
+ } else {
+ usage = bb_usage_xchg(src, dst);
+ }
+ break;
+ case BBOU_XOR:
+ /* xor %reg,%reg only counts as a register write, the original
+ * contents of reg are irrelevant.
+ */
+ if (src->reg && dst->reg && src->base_rc == dst->base_rc)
+ usage = BBOU_WS;
+ else
+ usage = BBOU_RSRDWD;
+ break;
+ }
+
+ /* The switch statement above handled all the special cases. Every
+ * opcode should now have a usage of NOP or one of the generic cases.
+ */
+ if (usage == BBOU_UNKNOWN || usage == BBOU_NOP) {
+ /* nothing to do */
+ } else if (usage >= BBOU_RS && usage <= BBOU_RSRDWSWD) {
+ if (usage & BBOU_RS)
+ bb_read_operand(src);
+ if (usage & BBOU_RD)
+ bb_read_operand(dst);
+ if (usage & BBOU_WS)
+ bb_write_operand(src);
+ if (usage & BBOU_WD)
+ bb_write_operand(dst);
+ } else {
+ kdb_printf("%s: opcode not fully handled\n", __FUNCTION__);
+ if (!KDB_DEBUG(BB)) {
+ bb_print_opcode();
+ if (bb_decode.src.present)
+ bb_print_operand("src", &bb_decode.src);
+ if (bb_decode.dst.present)
+ bb_print_operand("dst", &bb_decode.dst);
+ if (bb_decode.dst2.present)
+ bb_print_operand("dst2", &bb_decode.dst2);
+ }
+ bb_giveup = 1;
+ }
+}
+
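+/* Parse one line of disassembler output that has accumulated in bb_buffer.
+ * The line looks like "address <function+offset>: [prefix] opcode [operands]"
+ * (illustrative); everything up to the first ':' is skipped, the opcode and
+ * optional prefix are isolated, and the operands are split at commas that are
+ * not inside parentheses, giving src, dst and dst2.
+ */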
+static void
+bb_parse_buffer(void)
+{
+ char *p, *src, *dst = NULL, *dst2 = NULL;
+ int paren = 0;
+ p = bb_buffer;
+ memset(&bb_decode, 0, sizeof(bb_decode));
+ KDB_DEBUG_BB(" '%s'\n", p);
+ p += strcspn(p, ":"); /* skip address and function name+offset: */
+ if (*p++ != ':') {
+ kdb_printf("%s: cannot find ':' in buffer '%s'\n",
+ __FUNCTION__, bb_buffer);
+ bb_giveup = 1;
+ return;
+ }
+ p += strspn(p, " \t"); /* step to opcode */
+ if (strncmp(p, "(bad)", 5) == 0)
+ strcpy(p, "nop");
+ /* separate any opcode prefix */
+ if (strncmp(p, "lock", 4) == 0 ||
+ strncmp(p, "rep", 3) == 0 ||
+ strncmp(p, "rex", 3) == 0 ||
+ strncmp(p, "addr", 4) == 0) {
+ bb_decode.prefix = p;
+ p += strcspn(p, " \t");
+ *p++ = '\0';
+ p += strspn(p, " \t");
+ }
+ bb_decode.opcode = p;
+ strsep(&p, " \t"); /* step to end of opcode */
+ if (bb_parse_opcode())
+ return;
+ if (!p)
+ goto no_operands;
+ p += strspn(p, " \t"); /* step to operand(s) */
+ if (!*p)
+ goto no_operands;
+ src = p;
+ p = strsep(&p, " \t"); /* strip comments after operands */
+ /* split 'src','dst' but ignore ',' inside '(' ')' */
+ while (*p) {
+ if (*p == '(') {
+ ++paren;
+ } else if (*p == ')') {
+ --paren;
+ } else if (*p == ',' && paren == 0) {
+ *p = '\0';
+ if (dst)
+ dst2 = p+1;
+ else
+ dst = p+1;
+ }
+ ++p;
+ }
+ bb_parse_operand(src, &bb_decode.src);
+ if (KDB_DEBUG(BB))
+ bb_print_operand("src", &bb_decode.src);
+ if (dst && !bb_giveup) {
+ bb_parse_operand(dst, &bb_decode.dst);
+ if (KDB_DEBUG(BB))
+ bb_print_operand("dst", &bb_decode.dst);
+ }
+ if (dst2 && !bb_giveup) {
+ bb_parse_operand(dst2, &bb_decode.dst2);
+ if (KDB_DEBUG(BB))
+ bb_print_operand("dst2", &bb_decode.dst2);
+ }
+no_operands:
+ if (!bb_giveup)
+ bb_usage();
+}
+
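+/* fprintf callback used while disassembling in pass 2.  The disassembler
+ * emits each line as several calls, so append every fragment to bb_buffer and
+ * only hand the completed line to bb_parse_buffer() once a newline is seen.
+ */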
+static int
+bb_dis_pass2(PTR file, const char *fmt, ...)
+{
+ char *p;
+ int l = strlen(bb_buffer);
+ va_list ap;
+ va_start(ap, fmt);
+ vsnprintf(bb_buffer + l, sizeof(bb_buffer) - l, fmt, ap);
+ va_end(ap);
+ if ((p = strchr(bb_buffer, '\n'))) {
+ *p = '\0';
+ p = bb_buffer;
+ p += strcspn(p, ":");
+ if (*p++ == ':')
+ bb_fixup_switch_to(p);
+ bb_parse_buffer();
+ bb_buffer[0] = '\0';
+ }
+ return 0;
+}
+
+static void
+bb_printaddr_pass2(bfd_vma addr, disassemble_info *dip)
+{
+ kdb_symtab_t symtab;
+ unsigned int offset;
+ dip->fprintf_func(dip->stream, "0x%lx", addr);
+ kdbnearsym(addr, &symtab);
+ if (symtab.sym_name) {
+ dip->fprintf_func(dip->stream, " <%s", symtab.sym_name);
+ if ((offset = addr - symtab.sym_start))
+ dip->fprintf_func(dip->stream, "+0x%x", offset);
+ dip->fprintf_func(dip->stream, ">");
+ }
+}
+
+/* Set the starting register and memory state for the current bb */
+
+static void
+bb_start_block0_special(void)
+{
+ int i;
+ short offset_address;
+ enum bb_reg_code reg, value;
+ struct bb_name_state *r;
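+ /* Only special case entries with fname == NULL and whose address
+ * matches the start of this function supply a non-standard starting
+ * state for block 0.
+ */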
+ for (i = 0, r = bb_special_cases;
+ i < ARRAY_SIZE(bb_special_cases);
+ ++i, ++r) {
+ if (bb_func_start == r->address && r->fname == NULL)
+ goto match;
+ }
+ return;
+match:
+ /* Set the running registers */
+ for (reg = BBRG_RAX; reg < r->regs_size; ++reg) {
+ value = r->regs[reg].value;
+ if (test_bit(value, r->skip_regs.bits)) {
+ /* this regs entry is not defined for this label */
+ continue;
+ }
+ bb_reg_code_set_value(reg, value);
+ bb_reg_code_set_offset(reg, r->regs[reg].offset);
+ }
+ /* Set any memory contents, e.g. pt_regs. Adjust RSP as required. */
+ offset_address = 0;
+ for (i = 0; i < r->mem_size; ++i) {
+ offset_address = max_t(int,
+ r->mem[i].offset_address + KDB_WORD_SIZE,
+ offset_address);
+ }
+ if (bb_reg_code_offset(BBRG_RSP) > -offset_address)
+ bb_adjust_osp(BBRG_RSP, -offset_address - bb_reg_code_offset(BBRG_RSP));
+ for (i = 0; i < r->mem_size; ++i) {
+ value = r->mem[i].value;
+ if (test_bit(value, r->skip_mem.bits)) {
+ /* this memory entry is not defined for this label */
+ continue;
+ }
+ bb_memory_set_reg_value(BBRG_RSP, r->mem[i].offset_address,
+ value, 0);
+ bb_reg_set_undef(value);
+ }
+ return;
+}
+
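+/* Set the register and memory state at the start of basic block 'number'.
+ * Block 0 gets the well defined function entry state, every other block gets
+ * the merge of the exit states of all the blocks that jump to it.
+ */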
+static void
+bb_pass2_start_block(int number)
+{
+ int i, j, k, first, changed;
+ size_t size;
+ struct bb_jmp *bb_jmp;
+ struct bb_reg_state *state;
+ struct bb_memory_contains *c1, *c2;
+ bb_reg_state->mem_count = bb_reg_state_max;
+ size = bb_reg_state_size(bb_reg_state);
+ memset(bb_reg_state, 0, size);
+
+ if (number == 0) {
+ /* The first block is assumed to have well defined inputs */
+ bb_start_block0();
+ /* Some assembler labels have non-standard entry
+ * states.
+ */
+ bb_start_block0_special();
+ bb_reg_state_print(bb_reg_state);
+ return;
+ }
+
+ /* Merge all the input states for the current bb together */
+ first = 1;
+ changed = 0;
+ for (i = 0; i < bb_jmp_count; ++i) {
+ bb_jmp = bb_jmp_list + i;
+ if (bb_jmp->to != bb_curr->start)
+ continue;
+ state = bb_jmp->state;
+ if (!state)
+ continue;
+ if (first) {
+ size = bb_reg_state_size(state);
+ memcpy(bb_reg_state, state, size);
+ KDB_DEBUG_BB(" first state %p\n", state);
+ bb_reg_state_print(bb_reg_state);
+ first = 0;
+ continue;
+ }
+
+ KDB_DEBUG_BB(" merging state %p\n", state);
+ /* Merge the register states */
+ for (j = 0; j < ARRAY_SIZE(state->contains); ++j) {
+ if (memcmp(bb_reg_state->contains + j,
+ state->contains + j,
+ sizeof(bb_reg_state->contains[0]))) {
+ /* Different states for this register from two
+ * or more inputs, make it undefined.
+ */
+ if (bb_reg_state->contains[j].value ==
+ BBRG_UNDEFINED) {
+ KDB_DEBUG_BB(" ignoring %s\n",
+ bbrg_name[j + BBRG_RAX]);
+ } else {
+ bb_reg_set_undef(BBRG_RAX + j);
+ changed = 1;
+ }
+ }
+ }
+
+ /* Merge the memory states. This relies on both
+ * bb_reg_state->memory and state->memory being sorted in
+ * descending order, with undefined entries at the end.
+ */
+ c1 = bb_reg_state->memory;
+ c2 = state->memory;
+ j = k = 0;
+ while (j < bb_reg_state->mem_count &&
+ k < state->mem_count) {
+ if (c1->offset_address < c2->offset_address) {
+ KDB_DEBUG_BB_OFFSET(c2->offset_address,
+ " ignoring c2->offset_address ",
+ "\n");
+ ++c2;
+ ++k;
+ continue;
+ }
+ if (c1->offset_address > c2->offset_address) {
+ /* Memory location is not in all input states,
+ * delete the memory location.
+ */
+ bb_delete_memory(c1->offset_address);
+ changed = 1;
+ ++c1;
+ ++j;
+ continue;
+ }
+ if (memcmp(c1, c2, sizeof(*c1))) {
+ /* Same location, different contents, delete
+ * the memory location.
+ */
+ bb_delete_memory(c1->offset_address);
+ KDB_DEBUG_BB_OFFSET(c2->offset_address,
+ " ignoring c2->offset_address ",
+ "\n");
+ changed = 1;
+ }
+ ++c1;
+ ++c2;
+ ++j;
+ ++k;
+ }
+ while (j < bb_reg_state->mem_count) {
+ bb_delete_memory(c1->offset_address);
+ changed = 1;
+ ++c1;
+ ++j;
+ }
+ }
+ if (changed) {
+ KDB_DEBUG_BB(" final state\n");
+ bb_reg_state_print(bb_reg_state);
+ }
+}
+
+/* We have reached the exit point from the current function, either a call to
+ * the next function or the instruction that was about to be executed when an
+ * interrupt occurred. Save the current register state in bb_exit_state.
+ */
+
+static void
+bb_save_exit_state(void)
+{
+ size_t size;
+ debug_kfree(bb_exit_state);
+ bb_exit_state = NULL;
+ bb_reg_state_canonicalize();
+ size = bb_reg_state_size(bb_reg_state);
+ bb_exit_state = debug_kmalloc(size, GFP_ATOMIC);
+ if (!bb_exit_state) {
+ kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
+ bb_giveup = 1;
+ return;
+ }
+ memcpy(bb_exit_state, bb_reg_state, size);
+}
+
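+/* Process every basic block whose input state has changed, skipping any block
+ * that still has more than allow_missing unresolved input states.  Returns 0
+ * when all blocks have been processed, 1 if some blocks are still waiting on
+ * missing inputs and the caller should retry with a larger allow_missing.
+ */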
+static int
+bb_pass2_do_changed_blocks(int allow_missing)
+{
+ int i, j, missing, changed, maxloops;
+ unsigned long addr;
+ struct bb_jmp *bb_jmp;
+ KDB_DEBUG_BB("\n %s: allow_missing %d\n", __FUNCTION__, allow_missing);
+ /* Absolute worst case is we have to iterate over all the basic blocks
+ * in an "out of order" state, each iteration losing one register or
+ * memory state. Any more loops than that is a bug. "out of order"
+ * means that the layout of blocks in memory does not match the logic
+ * flow through those blocks so (for example) block 27 comes before
+ * block 2. To allow for out of order blocks, multiply maxloops by the
+ * number of blocks.
+ */
+ maxloops = (KDB_INT_REGISTERS + bb_reg_state_max) * bb_count;
+ changed = 1;
+ do {
+ changed = 0;
+ for (i = 0; i < bb_count; ++i) {
+ bb_curr = bb_list[i];
+ if (!bb_curr->changed)
+ continue;
+ missing = 0;
+ for (j = 0, bb_jmp = bb_jmp_list;
+ j < bb_jmp_count;
+ ++j, ++bb_jmp) {
+ if (bb_jmp->to == bb_curr->start &&
+ !bb_jmp->state)
+ ++missing;
+ }
+ if (missing > allow_missing)
+ continue;
+ bb_curr->changed = 0;
+ changed = 1;
+ KDB_DEBUG_BB("\n bb[%d]\n", i);
+ bb_pass2_start_block(i);
+ for (addr = bb_curr->start;
+ addr <= bb_curr->end; ) {
+ bb_curr_addr = addr;
+ if (addr == bb_exit_addr)
+ bb_save_exit_state();
+ addr += kdba_id_printinsn(addr, &kdb_di);
+ kdb_di.fprintf_func(NULL, "\n");
+ if (bb_giveup)
+ goto done;
+ }
+ if (!bb_exit_state) {
+ /* ATTRIB_NORET functions are a problem with
+ * the current gcc. Allow the trailing address
+ * a bit of leeway.
+ */
+ if (addr == bb_exit_addr ||
+ addr == bb_exit_addr + 1)
+ bb_save_exit_state();
+ }
+ if (bb_curr->drop_through)
+ bb_transfer(bb_curr->end,
+ bb_list[i+1]->start, 1);
+ }
+ if (maxloops-- == 0) {
+ kdb_printf("\n\n%s maxloops reached\n",
+ __FUNCTION__);
+ bb_giveup = 1;
+ goto done;
+ }
+ } while(changed);
+done:
+ for (i = 0; i < bb_count; ++i) {
+ bb_curr = bb_list[i];
+ if (bb_curr->changed)
+ return 1; /* more to do, increase allow_missing */
+ }
+ return 0; /* all blocks done */
+}
+
+/* Assume that the current function is a pass through function that does not
+ * refer to its register parameters. Exclude known asmlinkage functions and
+ * assume the other functions actually use their registers.
+ */
+
+static void
+bb_assume_pass_through(void)
+{
+ static int first_time = 1;
+ if (strncmp(bb_func_name, "sys_", 4) == 0 ||
+ strncmp(bb_func_name, "compat_sys_", 11) == 0 ||
+ strcmp(bb_func_name, "schedule") == 0 ||
+ strcmp(bb_func_name, "do_softirq") == 0 ||
+ strcmp(bb_func_name, "printk") == 0 ||
+ strcmp(bb_func_name, "vprintk") == 0 ||
+ strcmp(bb_func_name, "preempt_schedule") == 0 ||
+ strcmp(bb_func_name, "start_kernel") == 0 ||
+ strcmp(bb_func_name, "csum_partial") == 0 ||
+ strcmp(bb_func_name, "csum_partial_copy_generic") == 0 ||
+ strcmp(bb_func_name, "math_state_restore") == 0 ||
+ strcmp(bb_func_name, "panic") == 0 ||
+ strcmp(bb_func_name, "kdb_printf") == 0 ||
+ strcmp(bb_func_name, "kdb_interrupt") == 0)
+ return;
+ if (bb_asmlinkage_arch())
+ return;
+ bb_reg_params = REGPARM;
+ if (first_time) {
+ kdb_printf(" %s has memory parameters but no register "
+ "parameters.\n Assuming it is a 'pass "
+ "through' function that does not refer to "
+ "its register\n parameters and setting %d "
+ "register parameters\n",
+ bb_func_name, REGPARM);
+ first_time = 0;
+ return;
+ }
+ kdb_printf(" Assuming %s is 'pass through' with %d register "
+ "parameters\n",
+ bb_func_name, REGPARM);
+}
+
+static void
+bb_pass2(void)
+{
+ int allow_missing;
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
+ kdb_printf("%s: start\n", __FUNCTION__);
+
+ kdb_di.fprintf_func = bb_dis_pass2;
+ kdb_di.print_address_func = bb_printaddr_pass2;
+
+ bb_reg_state = debug_kmalloc(sizeof(*bb_reg_state), GFP_ATOMIC);
+ if (!bb_reg_state) {
+ kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
+ bb_giveup = 1;
+ return;
+ }
+ bb_list[0]->changed = 1;
+
+ /* If a block does not have all its input states available then it is
+ * possible for a register to initially appear to hold a known value,
+ * but when other inputs are available then it becomes a variable
+ * value. The initial false state of "known" can generate false values
+ * for other registers and can even make it look like stack locations
+ * are being changed.
+ *
+ * To avoid these false positives, only process blocks which have all
+ * their inputs defined. That gives a clean depth first traversal of
+ * the tree, except for loops. If there are any loops, then start
+ * processing blocks with one missing input, then two missing inputs
+ * etc.
+ *
+ * Absolute worst case is we have to iterate over all the jmp entries,
+ * each iteration allowing one more missing input. Any more loops than
+ * that is a bug. Watch out for the corner case of 0 jmp entries.
+ */
+ for (allow_missing = 0; allow_missing <= bb_jmp_count; ++allow_missing) {
+ if (!bb_pass2_do_changed_blocks(allow_missing))
+ break;
+ if (bb_giveup)
+ break;
+ }
+ if (allow_missing > bb_jmp_count) {
+ kdb_printf("\n\n%s maxloops reached\n",
+ __FUNCTION__);
+ bb_giveup = 1;
+ return;
+ }
+
+ if (bb_memory_params && bb_reg_params)
+ bb_reg_params = REGPARM;
+ if (REGPARM &&
+ bb_memory_params &&
+ !bb_reg_params)
+ bb_assume_pass_through();
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM)) {
+ kdb_printf("%s: end bb_reg_params %d bb_memory_params %d\n",
+ __FUNCTION__, bb_reg_params, bb_memory_params);
+ if (bb_exit_state) {
+ kdb_printf("%s: bb_exit_state at " kdb_bfd_vma_fmt0 "\n",
+ __FUNCTION__, bb_exit_addr);
+ bb_do_reg_state_print(bb_exit_state);
+ }
+ }
+}
+
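+/* Release all the state built up by the basic block analysis: the block list,
+ * the jmp list with its reference counted register states, and the working
+ * and exit register states.
+ */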
+static void
+bb_cleanup(void)
+{
+ int i;
+ struct bb* bb;
+ struct bb_reg_state *state;
+ while (bb_count) {
+ bb = bb_list[0];
+ bb_delete(0);
+ }
+ debug_kfree(bb_list);
+ bb_list = NULL;
+ bb_count = bb_max = 0;
+ for (i = 0; i < bb_jmp_count; ++i) {
+ state = bb_jmp_list[i].state;
+ if (state && --state->ref_count == 0)
+ debug_kfree(state);
+ }
+ debug_kfree(bb_jmp_list);
+ bb_jmp_list = NULL;
+ bb_jmp_count = bb_jmp_max = 0;
+ debug_kfree(bb_reg_state);
+ bb_reg_state = NULL;
+ bb_reg_state_max = 0;
+ debug_kfree(bb_exit_state);
+ bb_exit_state = NULL;
+ bb_reg_params = bb_memory_params = 0;
+ bb_giveup = 0;
+}
+
+static int
+bb_spurious_global_label(const char *func_name)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(bb_spurious); ++i) {
+ if (strcmp(bb_spurious[i], func_name) == 0)
+ return 1;
+ }
+ return 0;
+}
+
+/* Given the current actual register contents plus the exit state deduced from
+ * a basic block analysis of the current function, roll back the actual register
+ * contents to the values they had on entry to this function.
+ */
+
+static void
+bb_actual_rollback(const struct kdb_activation_record *ar)
+{
+ int i, offset_address;
+ struct bb_memory_contains *c;
+ enum bb_reg_code reg;
+ unsigned long address, osp = 0;
+ struct bb_actual new[ARRAY_SIZE(bb_actual)];
+
+
+ if (!bb_exit_state) {
+ kdb_printf("%s: no bb_exit_state, cannot rollback\n",
+ __FUNCTION__);
+ bb_giveup = 1;
+ return;
+ }
+ memcpy(bb_reg_state, bb_exit_state, bb_reg_state_size(bb_exit_state));
+ memset(new, 0, sizeof(new));
+
+ /* The most important register for obtaining saved state is rsp so get
+ * its new value first. Prefer rsp if it is valid, then other
+ * registers. Saved values of rsp in memory are unusable without a
+ * register that points to memory.
+ */
+ if (!bb_actual_valid(BBRG_RSP)) {
+ kdb_printf("%s: no starting value for RSP, cannot rollback\n",
+ __FUNCTION__);
+ bb_giveup = 1;
+ return;
+ }
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
+ kdb_printf("%s: rsp " kdb_bfd_vma_fmt0,
+ __FUNCTION__, bb_actual_value(BBRG_RSP));
+ i = BBRG_RSP;
+ if (!bb_is_osp_defined(i)) {
+ for (i = BBRG_RAX; i < BBRG_RAX + KDB_INT_REGISTERS; ++i) {
+ if (bb_is_osp_defined(i) && bb_actual_valid(i))
+ break;
+ }
+ }
+ if (bb_is_osp_defined(i) && bb_actual_valid(i)) {
+ osp = new[BBRG_RSP - BBRG_RAX].value =
+ bb_actual_value(i) - bb_reg_code_offset(i);
+ new[BBRG_RSP - BBRG_RAX].valid = 1;
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
+ kdb_printf(" -> osp " kdb_bfd_vma_fmt0 "\n", osp);
+ } else {
+ bb_actual_set_valid(BBRG_RSP, 0);
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
+ kdb_printf(" -> undefined\n");
+ kdb_printf("%s: no ending value for RSP, cannot rollback\n",
+ __FUNCTION__);
+ bb_giveup = 1;
+ return;
+ }
+
+ /* Now the other registers. First look at register values that have
+ * been copied to other registers.
+ */
+ for (i = BBRG_RAX; i < BBRG_RAX + KDB_INT_REGISTERS; ++i) {
+ reg = bb_reg_code_value(i);
+ if (bb_is_int_reg(reg)) {
+ new[reg - BBRG_RAX] = bb_actual[i - BBRG_RAX];
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM)) {
+ kdb_printf("%s: %s is in %s ",
+ __FUNCTION__,
+ bbrg_name[reg],
+ bbrg_name[i]);
+ if (bb_actual_valid(i))
+ kdb_printf(" -> " kdb_bfd_vma_fmt0 "\n",
+ bb_actual_value(i));
+ else
+ kdb_printf("(invalid)\n");
+ }
+ }
+ }
+
+ /* Finally register values that have been saved on stack */
+ for (i = 0, c = bb_reg_state->memory;
+ i < bb_reg_state->mem_count;
+ ++i, ++c) {
+ offset_address = c->offset_address;
+ reg = c->value;
+ if (!bb_is_int_reg(reg))
+ continue;
+ address = osp + offset_address;
+ if (address < ar->stack.logical_start ||
+ address >= ar->stack.logical_end) {
+ new[reg - BBRG_RAX].value = 0;
+ new[reg - BBRG_RAX].valid = 0;
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
+ kdb_printf("%s: %s -> undefined\n",
+ __FUNCTION__,
+ bbrg_name[reg]);
+ } else {
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM)) {
+ kdb_printf("%s: %s -> *(osp",
+ __FUNCTION__,
+ bbrg_name[reg]);
+ KDB_DEBUG_BB_OFFSET_PRINTF(offset_address, "", " ");
+ kdb_printf(kdb_bfd_vma_fmt0, address);
+ }
+ new[reg - BBRG_RAX].value = *(bfd_vma *)address;
+ new[reg - BBRG_RAX].valid = 1;
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
+ kdb_printf(") = " kdb_bfd_vma_fmt0 "\n",
+ new[reg - BBRG_RAX].value);
+ }
+ }
+
+ memcpy(bb_actual, new, sizeof(bb_actual));
+}
+
+/* Return true if the current function is an interrupt handler */
+
+static bool
+bb_interrupt_handler(kdb_machreg_t rip)
+{
+ unsigned long disp8, disp32, target, addr = (unsigned long)rip;
+ unsigned char code[5];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(bb_hardware_handlers); ++i)
+ if (strcmp(bb_func_name, bb_hardware_handlers[i]) == 0)
+ return 1;
+
+ /* Given the large number of interrupt handlers, it is easiest to look
+ * at the next instruction and see if it is a jmp to the common exit
+ * routines.
+ */
+ if (kdb_getarea(code, addr) ||
+ kdb_getword(&disp32, addr+1, 4) ||
+ kdb_getword(&disp8, addr+1, 1))
+ return 0; /* not a valid code address */
+ if (code[0] == 0xe9) {
+ target = addr + (s32) disp32 + 5; /* jmp disp32 */
+ if (target == bb_ret_from_intr ||
+ target == bb_common_interrupt ||
+ target == bb_error_entry)
+ return 1;
+ }
+ if (code[0] == 0xeb) {
+ target = addr + (s8) disp8 + 2; /* jmp disp8 */
+ if (target == bb_ret_from_intr ||
+ target == bb_common_interrupt ||
+ target == bb_error_entry)
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Copy argument information that was deduced by the basic block analysis and
+ * rollback into the kdb stack activation record.
+ */
+
+static void
+bb_arguments(struct kdb_activation_record *ar)
+{
+ int i;
+ enum bb_reg_code reg;
+ kdb_machreg_t rsp;
+ ar->args = bb_reg_params + bb_memory_params;
+ bitmap_zero(ar->valid.bits, KDBA_MAXARGS);
+ for (i = 0; i < bb_reg_params; ++i) {
+ reg = bb_param_reg[i];
+ if (bb_actual_valid(reg)) {
+ ar->arg[i] = bb_actual_value(reg);
+ set_bit(i, ar->valid.bits);
+ }
+ }
+ if (!bb_actual_valid(BBRG_RSP))
+ return;
+ rsp = bb_actual_value(BBRG_RSP);
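+ /* Memory parameters are read from the words immediately above the
+ * rolled back entry stack pointer, i.e. just above the return address.
+ */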
+ for (i = bb_reg_params; i < ar->args; ++i) {
+ rsp += KDB_WORD_SIZE;
+ if (kdb_getarea(ar->arg[i], rsp) == 0)
+ set_bit(i, ar->valid.bits);
+ }
+}
+
+/* Given an exit address from a function, decompose the entire function into
+ * basic blocks and determine the register state at the exit point.
+ */
+
+static void
+kdb_bb(unsigned long exit)
+{
+ kdb_symtab_t symtab;
+ if (!kdbnearsym(exit, &symtab)) {
+ kdb_printf("%s: address " kdb_bfd_vma_fmt0 " not recognised\n",
+ __FUNCTION__, exit);
+ bb_giveup = 1;
+ return;
+ }
+ bb_exit_addr = exit;
+ bb_mod_name = symtab.mod_name;
+ bb_func_name = symtab.sym_name;
+ bb_func_start = symtab.sym_start;
+ bb_func_end = symtab.sym_end;
+ /* Various global labels exist in the middle of assembler code and have
+ * a non-standard state. Ignore these labels and use the start of the
+ * previous label instead.
+ */
+ while (bb_spurious_global_label(symtab.sym_name)) {
+ if (!kdbnearsym(symtab.sym_start - 1, &symtab))
+ break;
+ bb_func_start = symtab.sym_start;
+ }
+ bb_mod_name = symtab.mod_name;
+ bb_func_name = symtab.sym_name;
+ bb_func_start = symtab.sym_start;
+ /* Ignore spurious labels past this point and use the next non-spurious
+ * label as the end point.
+ */
+ if (kdbnearsym(bb_func_end, &symtab)) {
+ while (bb_spurious_global_label(symtab.sym_name)) {
+ bb_func_end = symtab.sym_end;
+ if (!kdbnearsym(symtab.sym_end + 1, &symtab))
+ break;
+ }
+ }
+ bb_pass1();
+ if (!bb_giveup)
+ bb_pass2();
+ if (bb_giveup)
+ kdb_printf("%s: " kdb_bfd_vma_fmt0
+ " [%s]%s failed at " kdb_bfd_vma_fmt0 "\n\n",
+ __FUNCTION__, exit,
+ bb_mod_name, bb_func_name, bb_curr_addr);
+}
+
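+/* Handler for the kdb 'bb1' command: run the basic block analysis on a single
+ * address with the BB debug flag temporarily set so that the analysis trace
+ * is printed.
+ */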
+static int
+kdb_bb1(int argc, const char **argv)
+{
+ int diag, nextarg = 1;
+ kdb_machreg_t addr;
+ unsigned long offset;
+
+ bb_cleanup(); /* in case previous command was interrupted */
+ kdba_id_init(&kdb_di);
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+ if (!addr)
+ return KDB_BADADDR;
+ kdb_save_flags();
+ kdb_flags |= KDB_DEBUG_FLAG_BB << KDB_DEBUG_FLAG_SHIFT;
+ kdb_bb(addr);
+ bb_cleanup();
+ kdb_restore_flags();
+ kdbnearsym_cleanup();
+ return 0;
+}
+
+/* Run a basic block analysis on every function in the base kernel. Used as a
+ * global sanity check to find errors in the basic block code.
+ */
+
+static int
+kdb_bb_all(int argc, const char **argv)
+{
+ loff_t pos = 0;
+ const char *symname;
+ unsigned long addr;
+ int i, max_errors = 20;
+ struct bb_name_state *r;
+ kdb_printf("%s: build variables:"
+ " CCVERSION \"" __stringify(CCVERSION) "\""
+#ifdef CONFIG_X86_64
+ " CONFIG_X86_64"
+#endif
+#ifdef CONFIG_4KSTACKS
+ " CONFIG_4KSTACKS"
+#endif
+#ifdef CONFIG_PREEMPT
+ " CONFIG_PREEMPT"
+#endif
+#ifdef CONFIG_VM86
+ " CONFIG_VM86"
+#endif
+#ifdef CONFIG_FRAME_POINTER
+ " CONFIG_FRAME_POINTER"
+#endif
+#ifdef CONFIG_TRACE_IRQFLAGS
+ " CONFIG_TRACE_IRQFLAGS"
+#endif
+#ifdef CONFIG_HIBERNATION
+ " CONFIG_HIBERNATION"
+#endif
+#ifdef CONFIG_KPROBES
+ " CONFIG_KPROBES"
+#endif
+#ifdef CONFIG_KEXEC
+ " CONFIG_KEXEC"
+#endif
+#ifdef CONFIG_MATH_EMULATION
+ " CONFIG_MATH_EMULATION"
+#endif
- #ifdef CONFIG_PARAVIRT_XEN
++#ifdef CONFIG_XEN
+ " CONFIG_XEN"
+#endif
+#ifdef CONFIG_DEBUG_INFO
+ " CONFIG_DEBUG_INFO"
+#endif
+#ifdef NO_SIBLINGS
+ " NO_SIBLINGS"
+#endif
+ " REGPARM=" __stringify(REGPARM)
+ "\n\n", __FUNCTION__);
+ for (i = 0, r = bb_special_cases;
+ i < ARRAY_SIZE(bb_special_cases);
+ ++i, ++r) {
+ if (!r->address)
+ kdb_printf("%s: cannot find special_case name %s\n",
+ __FUNCTION__, r->name);
+ }
+ for (i = 0; i < ARRAY_SIZE(bb_spurious); ++i) {
+ if (!kallsyms_lookup_name(bb_spurious[i]))
+ kdb_printf("%s: cannot find spurious label %s\n",
+ __FUNCTION__, bb_spurious[i]);
+ }
+ while ((symname = kdb_walk_kallsyms(&pos))) {
+ if (strcmp(symname, "_stext") == 0 ||
+ strcmp(symname, "stext") == 0)
+ break;
+ }
+ if (!symname) {
+ kdb_printf("%s: cannot find _stext\n", __FUNCTION__);
+ return 0;
+ }
+ kdba_id_init(&kdb_di);
+ i = 0;
+ while ((symname = kdb_walk_kallsyms(&pos))) {
+ if (strcmp(symname, "_etext") == 0)
+ break;
+ if (i++ % 100 == 0)
+ kdb_printf(".");
+ /* x86_64 has some 16 bit functions that appear between stext
+ * and _etext. Skip them.
+ */
+ if (strcmp(symname, "verify_cpu") == 0 ||
+ strcmp(symname, "verify_cpu_noamd") == 0 ||
+ strcmp(symname, "verify_cpu_sse_test") == 0 ||
+ strcmp(symname, "verify_cpu_no_longmode") == 0 ||
+ strcmp(symname, "verify_cpu_sse_ok") == 0 ||
+ strcmp(symname, "mode_seta") == 0 ||
+ strcmp(symname, "bad_address") == 0 ||
+ strcmp(symname, "wakeup_code") == 0 ||
+ strcmp(symname, "wakeup_code_start") == 0 ||
+ strcmp(symname, "wakeup_start") == 0 ||
+ strcmp(symname, "wakeup_32_vector") == 0 ||
+ strcmp(symname, "wakeup_32") == 0 ||
+ strcmp(symname, "wakeup_long64_vector") == 0 ||
+ strcmp(symname, "wakeup_long64") == 0 ||
+ strcmp(symname, "gdta") == 0 ||
+ strcmp(symname, "idt_48a") == 0 ||
+ strcmp(symname, "gdt_48a") == 0 ||
+ strcmp(symname, "bogus_real_magic") == 0 ||
+ strcmp(symname, "bogus_64_magic") == 0 ||
+ strcmp(symname, "no_longmode") == 0 ||
+ strcmp(symname, "mode_set") == 0 ||
+ strcmp(symname, "mode_seta") == 0 ||
+ strcmp(symname, "setbada") == 0 ||
+ strcmp(symname, "check_vesa") == 0 ||
+ strcmp(symname, "check_vesaa") == 0 ||
+ strcmp(symname, "_setbada") == 0 ||
+ strcmp(symname, "wakeup_stack_begin") == 0 ||
+ strcmp(symname, "wakeup_stack") == 0 ||
+ strcmp(symname, "wakeup_level4_pgt") == 0 ||
+ strcmp(symname, "acpi_copy_wakeup_routine") == 0 ||
+ strcmp(symname, "wakeup_end") == 0 ||
+ strcmp(symname, "do_suspend_lowlevel_s4bios") == 0 ||
+ strcmp(symname, "do_suspend_lowlevel") == 0 ||
+ strcmp(symname, "wakeup_pmode_return") == 0 ||
+ strcmp(symname, "restore_registers") == 0)
+ continue;
+ /* __kprobes_text_end contains branches to the middle of code,
+ * with undefined states.
+ */
+ if (strcmp(symname, "__kprobes_text_end") == 0)
+ continue;
+ /* Data in the middle of the text segment :( */
+ if (strcmp(symname, "level2_kernel_pgt") == 0 ||
+ strcmp(symname, "level3_kernel_pgt") == 0)
+ continue;
+ if (bb_spurious_global_label(symname))
+ continue;
+ if ((addr = kallsyms_lookup_name(symname)) == 0)
+ continue;
+ // kdb_printf("BB " kdb_bfd_vma_fmt0 " %s\n", addr, symname);
+ bb_cleanup(); /* in case previous command was interrupted */
+ kdbnearsym_cleanup();
+ kdb_bb(addr);
+ touch_nmi_watchdog();
+ if (bb_giveup) {
+ if (max_errors-- == 0) {
+ kdb_printf("%s: max_errors reached, giving up\n",
+ __FUNCTION__);
+ break;
+ } else {
+ bb_giveup = 0;
+ }
+ }
+ }
+ kdb_printf("\n");
+ bb_cleanup();
+ kdbnearsym_cleanup();
+ return 0;
+}
+
+/*
+ *=============================================================================
+ *
+ * Everything above this line is doing basic block analysis, function by
+ * function. Everything below this line uses the basic block data to do a
+ * complete backtrace over all functions that are used by a process.
+ *
+ *=============================================================================
+ */
+
+
+/*============================================================================*/
+/* */
+/* Most of the backtrace code and data is common to x86_64 and i386. This */
+/* large ifdef contains all of the differences between the two architectures. */
+/* */
+/* Make sure you update the correct section of this ifdef. */
+/* */
+/*============================================================================*/
+#define XCS "cs"
+#define RSP "sp"
+#define RIP "ip"
+#define ARCH_RSP sp
+#define ARCH_RIP ip
+
+#ifdef CONFIG_X86_64
+
+#define ARCH_NORMAL_PADDING (16 * 8)
+
+/* x86_64 has multiple alternate stacks, with different sizes and different
+ * offsets to get the link from one stack to the next. All of the stacks are
+ * in the per_cpu area: either in the orig_ist or irq_stack_ptr. Debug events
+ * can even have multiple nested stacks within the single physical stack,
+ * each nested stack has its own link and some of those links are wrong.
+ *
+ * Consistent it's not!
+ *
+ * Do not assume that these stacks are aligned on their size.
+ */
+#define INTERRUPT_STACK (N_EXCEPTION_STACKS + 1)
+void
+kdba_get_stack_info_alternate(kdb_machreg_t addr, int cpu,
+ struct kdb_activation_record *ar)
+{
+ static struct {
+ const char *id;
+ unsigned int total_size;
+ unsigned int nested_size;
+ unsigned int next;
+ } *sdp, stack_data[] = {
+ [STACKFAULT_STACK - 1] = { "stackfault", EXCEPTION_STKSZ, EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
+ [DOUBLEFAULT_STACK - 1] = { "doublefault", EXCEPTION_STKSZ, EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
+ [NMI_STACK - 1] = { "nmi", EXCEPTION_STKSZ, EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
+ [DEBUG_STACK - 1] = { "debug", DEBUG_STKSZ, EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
+ [MCE_STACK - 1] = { "machine check", EXCEPTION_STKSZ, EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
+ [INTERRUPT_STACK - 1] = { "interrupt", IRQ_STACK_SIZE, IRQ_STACK_SIZE, IRQ_STACK_SIZE - sizeof(void *) },
+ };
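+ /* total_size is the size of the whole per-cpu stack, nested_size the
+ * size of one nested stack within it, and next is the offset within a
+ * nested stack of the word holding the link to the next stack.
+ */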
+ unsigned long total_start = 0, total_size, total_end;
+ int sd, found = 0;
+ extern unsigned long kdba_orig_ist(int, int);
+
+ for (sd = 0, sdp = stack_data;
+ sd < ARRAY_SIZE(stack_data);
+ ++sd, ++sdp) {
+ total_size = sdp->total_size;
+ if (!total_size)
+ continue; /* in case stack_data[] has any holes */
+ if (cpu < 0) {
+ /* Arbitrary address which can be on any cpu, see if it
+ * falls within any of the alternate stacks
+ */
+ int c;
+ for_each_online_cpu(c) {
+ if (sd == INTERRUPT_STACK - 1)
+ total_end = (unsigned long)per_cpu(irq_stack_ptr, c);
+ else
+ total_end = per_cpu(orig_ist, c).ist[sd];
+ total_start = total_end - total_size;
+ if (addr >= total_start && addr < total_end) {
+ found = 1;
+ cpu = c;
+ break;
+ }
+ }
+ if (!found)
+ continue;
+ }
+ /* Only check the supplied or found cpu */
+ if (sd == INTERRUPT_STACK - 1)
+ total_end = (unsigned long)per_cpu(irq_stack_ptr, cpu);
+ else
+ total_end = per_cpu(orig_ist, cpu).ist[sd];
+ total_start = total_end - total_size;
+ if (addr >= total_start && addr < total_end) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ return;
+ /* find which nested stack the address is in */
+ while (addr > total_start + sdp->nested_size)
+ total_start += sdp->nested_size;
+ ar->stack.physical_start = total_start;
+ ar->stack.physical_end = total_start + sdp->nested_size;
+ ar->stack.logical_start = total_start;
+ ar->stack.logical_end = total_start + sdp->next;
+ ar->stack.next = *(unsigned long *)ar->stack.logical_end;
+ ar->stack.id = sdp->id;
+
+ /* Nasty: when switching to the interrupt stack, the stack state of the
+ * caller is split over two stacks, the original stack and the
+ * interrupt stack. One word (the previous frame pointer) is stored on
+ * the interrupt stack, the rest of the interrupt data is in the old
+ * frame. To make the interrupted stack state look as though it is
+ * contiguous, copy the missing word from the interrupt stack to the
+ * original stack and adjust the new stack pointer accordingly.
+ */
+
+ if (sd == INTERRUPT_STACK - 1) {
+ *(unsigned long *)(ar->stack.next - KDB_WORD_SIZE) =
+ ar->stack.next;
+ ar->stack.next -= KDB_WORD_SIZE;
+ }
+}
+
+/* rip is not in the thread struct for x86_64. We know that the stack value
+ * was saved in schedule near the label thread_return. Setting rip to
+ * thread_return lets the stack trace find that we are in schedule and
+ * correctly decode its prologue.
+ */
+
+static kdb_machreg_t
+kdba_bt_stack_rip(const struct task_struct *p)
+{
+ return bb_thread_return;
+}
+
+#else /* !CONFIG_X86_64 */
+
+#define ARCH_NORMAL_PADDING (19 * 4)
+
+#ifdef CONFIG_4KSTACKS
+static struct thread_info **kdba_hardirq_ctx, **kdba_softirq_ctx;
+#endif /* CONFIG_4KSTACKS */
+
+/* On a 4K stack kernel, hardirq_ctx and softirq_ctx are [NR_CPUS] arrays. The
+ * first element of each per-cpu stack is a struct thread_info.
+ */
+void
+kdba_get_stack_info_alternate(kdb_machreg_t addr, int cpu,
+ struct kdb_activation_record *ar)
+{
+#ifdef CONFIG_4KSTACKS
+ struct thread_info *tinfo;
+ tinfo = (struct thread_info *)(addr & -THREAD_SIZE);
+ if (cpu < 0) {
+ /* Arbitrary address, see if it falls within any of the irq
+ * stacks
+ */
+ int found = 0;
+ for_each_online_cpu(cpu) {
+ if (tinfo == kdba_hardirq_ctx[cpu] ||
+ tinfo == kdba_softirq_ctx[cpu]) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ return;
+ }
+ if (tinfo == kdba_hardirq_ctx[cpu] ||
+ tinfo == kdba_softirq_ctx[cpu]) {
+ ar->stack.physical_start = (kdb_machreg_t)tinfo;
+ ar->stack.physical_end = ar->stack.physical_start + THREAD_SIZE;
+ ar->stack.logical_start = ar->stack.physical_start +
+ sizeof(struct thread_info);
+ ar->stack.logical_end = ar->stack.physical_end;
+ ar->stack.next = tinfo->previous_esp;
+ if (tinfo == kdba_hardirq_ctx[cpu])
+ ar->stack.id = "hardirq_ctx";
+ else
+ ar->stack.id = "softirq_ctx";
+ }
+#endif /* CONFIG_4KSTACKS */
+}
+
+/* rip is in the thread struct for i386 */
+
+static kdb_machreg_t
+kdba_bt_stack_rip(const struct task_struct *p)
+{
+ return p->thread.ip;
+}
+
+#endif /* CONFIG_X86_64 */
+
+/* Given an address which claims to be on a stack, an optional cpu number and
+ * an optional task address, get information about the stack.
+ *
+ * t == NULL, cpu < 0 indicates an arbitrary stack address with no associated
+ * struct task, the address can be in an alternate stack or any task's normal
+ * stack.
+ *
+ * t != NULL, cpu >= 0 indicates a running task, the address can be in an
+ * alternate stack or that task's normal stack.
+ *
+ * t != NULL, cpu < 0 indicates a blocked task, the address can only be in that
+ * task's normal stack.
+ *
+ * t == NULL, cpu >= 0 is not a valid combination.
+ */
+
+static void
+kdba_get_stack_info(kdb_machreg_t rsp, int cpu,
+ struct kdb_activation_record *ar,
+ const struct task_struct *t)
+{
+ struct thread_info *tinfo;
+ struct task_struct *g, *p;
+ memset(&ar->stack, 0, sizeof(ar->stack));
+ if (KDB_DEBUG(ARA))
+ kdb_printf("%s: " RSP "=0x%lx cpu=%d task=%p\n",
+ __FUNCTION__, rsp, cpu, t);
+ if (t == NULL || cpu >= 0) {
+ kdba_get_stack_info_alternate(rsp, cpu, ar);
+ if (ar->stack.logical_start)
+ goto out;
+ }
+ rsp &= -THREAD_SIZE;
+ tinfo = (struct thread_info *)rsp;
+ if (t == NULL) {
+ /* Arbitrary stack address without an associated task, see if
+ * it falls within any normal process stack, including the idle
+ * tasks.
+ */
+ kdb_do_each_thread(g, p) {
+ if (tinfo == task_thread_info(p)) {
+ t = p;
+ goto found;
+ }
+ } kdb_while_each_thread(g, p);
+ for_each_online_cpu(cpu) {
+ p = idle_task(cpu);
+ if (tinfo == task_thread_info(p)) {
+ t = p;
+ goto found;
+ }
+ }
+ found:
+ if (KDB_DEBUG(ARA))
+ kdb_printf("%s: found task %p\n", __FUNCTION__, t);
+ } else if (cpu >= 0) {
+ /* running task */
+ struct kdb_running_process *krp = kdb_running_process + cpu;
+ if (krp->p != t || tinfo != task_thread_info(t))
+ t = NULL;
+ if (KDB_DEBUG(ARA))
+ kdb_printf("%s: running task %p\n", __FUNCTION__, t);
+ } else {
+ /* blocked task */
+ if (tinfo != task_thread_info(t))
+ t = NULL;
+ if (KDB_DEBUG(ARA))
+ kdb_printf("%s: blocked task %p\n", __FUNCTION__, t);
+ }
+ if (t) {
+ ar->stack.physical_start = rsp;
+ ar->stack.physical_end = rsp + THREAD_SIZE;
+ ar->stack.logical_start = rsp + sizeof(struct thread_info);
+ ar->stack.logical_end = ar->stack.physical_end - ARCH_NORMAL_PADDING;
+ ar->stack.next = 0;
+ ar->stack.id = "normal";
+ }
+out:
+ if (ar->stack.physical_start && KDB_DEBUG(ARA)) {
+ kdb_printf("%s: ar->stack\n", __FUNCTION__);
+ kdb_printf(" physical_start=0x%lx\n", ar->stack.physical_start);
+ kdb_printf(" physical_end=0x%lx\n", ar->stack.physical_end);
+ kdb_printf(" logical_start=0x%lx\n", ar->stack.logical_start);
+ kdb_printf(" logical_end=0x%lx\n", ar->stack.logical_end);
+ kdb_printf(" next=0x%lx\n", ar->stack.next);
+ kdb_printf(" id=%s\n", ar->stack.id);
+ kdb_printf(" set MDCOUNT %ld\n",
+ (ar->stack.physical_end - ar->stack.physical_start) /
+ KDB_WORD_SIZE);
+ kdb_printf(" mds " kdb_machreg_fmt0 "\n",
+ ar->stack.physical_start);
+ }
+}
+
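+/* Print one backtrace entry: the stack pointer, the symbolic instruction
+ * pointer and, when requested, the decoded argument values; arguments whose
+ * values could not be recovered are printed as "invalid".
+ */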
+static void
+bt_print_one(kdb_machreg_t rip, kdb_machreg_t rsp,
+ const struct kdb_activation_record *ar,
+ const kdb_symtab_t *symtab, int argcount)
+{
+ int btsymarg = 0;
+ int nosect = 0;
+
+ kdbgetintenv("BTSYMARG", &btsymarg);
+ kdbgetintenv("NOSECT", &nosect);
+
+ kdb_printf(kdb_machreg_fmt0, rsp);
+ kdb_symbol_print(rip, symtab,
+ KDB_SP_SPACEB|KDB_SP_VALUE);
+ if (argcount && ar->args) {
+ int i, argc = ar->args;
+ kdb_printf(" (");
+ if (argc > argcount)
+ argc = argcount;
+ for (i = 0; i < argc; i++) {
+ if (i)
+ kdb_printf(", ");
+ if (test_bit(i, ar->valid.bits))
+ kdb_printf("0x%lx", ar->arg[i]);
+ else
+ kdb_printf("invalid");
+ }
+ kdb_printf(")");
+ }
+ kdb_printf("\n");
+ if (symtab->sym_name) {
+ if (!nosect) {
+ kdb_printf(" %s",
+ symtab->mod_name);
+ if (symtab->sec_name && symtab->sec_start)
+ kdb_printf(" 0x%lx 0x%lx",
+ symtab->sec_start, symtab->sec_end);
+ kdb_printf(" 0x%lx 0x%lx\n",
+ symtab->sym_start, symtab->sym_end);
+ }
+ }
+ if (argcount && ar->args && btsymarg) {
+ int i, argc = ar->args;
+ kdb_symtab_t arg_symtab;
+ for (i = 0; i < argc; i++) {
+ kdb_machreg_t arg = ar->arg[i];
+ if (test_bit(i, ar->valid.bits) &&
+ kdbnearsym(arg, &arg_symtab)) {
+ kdb_printf(" ARG %2d ", i);
+ kdb_symbol_print(arg, &arg_symtab,
+ KDB_SP_DEFAULT|KDB_SP_NEWLINE);
+ }
+ }
+ }
+}
+
+static void
+kdba_bt_new_stack(struct kdb_activation_record *ar, kdb_machreg_t *rsp,
+ int *count, int *suppress)
+{
+ /* Nasty: save_args builds a partial pt_regs, with r15 through
+ * rbx not being filled in. It passes struct pt_regs* to do_IRQ (in
+ * rdi) but the stack pointer is not adjusted to account for r15
+ * through rbx. This has two effects :-
+ *
+ * (1) struct pt_regs on an external interrupt actually overlaps with
+ * the local stack area used by do_IRQ. Not only are r15-rbx
+ * undefined, the area that claims to hold their values can even
+ * change as the irq is processed.
+ *
+ * (2) The back stack pointer saved for the new frame is not pointing
+ * at pt_regs, it is pointing at rbx within the pt_regs passed to
+ * do_IRQ.
+ *
+ * There is nothing that I can do about (1) but I have to fix (2)
+ * because kdb backtrace looks for the "start" address of pt_regs as it
+ * walks back through the stacks. When switching from the interrupt
+ * stack to another stack, we have to assume that pt_regs has been
+ * seen and turn off backtrace suppression.
+ */
+ int probable_pt_regs = strcmp(ar->stack.id, "interrupt") == 0;
+ *rsp = ar->stack.next;
+ if (KDB_DEBUG(ARA))
+ kdb_printf("new " RSP "=" kdb_machreg_fmt0 "\n", *rsp);
+ bb_actual_set_value(BBRG_RSP, *rsp);
+ kdba_get_stack_info(*rsp, -1, ar, NULL);
+ if (!ar->stack.physical_start) {
+ kdb_printf("+++ Cannot resolve next stack\n");
+ } else if (!*suppress) {
+ kdb_printf(" ======================= <%s>\n",
+ ar->stack.id);
+ ++*count;
+ }
+ if (probable_pt_regs)
+ *suppress = 0;
+}
+
+/*
+ * kdba_bt_stack
+ *
+ * Inputs:
+ * addr Address provided to 'bt' command, if any.
+ * argcount
+ * p Pointer to task for 'btp' command.
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ * Remarks:
+ * Ultimately all the bt* commands come through this routine. If
+ * old_style is 0 then it uses the basic block analysis to get an accurate
+ * backtrace with arguments, otherwise it falls back to the old method of
+ * printing anything on the stack that looks like a kernel address.
+ *
+ * Allowing for the stack data pushed by the hardware is tricky. We
+ * deduce the presence of hardware pushed data by looking for interrupt
+ * handlers, either by name or by the code that they contain. This
+ * information must be applied to the next function up the stack, because
+ * the hardware data is above the saved rip for the interrupted (next)
+ * function.
+ *
+ * To make things worse, the amount of data pushed is arch specific and
+ * may depend on the rsp for the next function, not the current function.
+ * The number of bytes pushed by hardware cannot be calculated until we
+ * are actually processing the stack for the interrupted function and have
+ * its rsp.
+ *
+ * It is also possible for an interrupt to occur in user space and for the
+ * interrupt handler to also be interrupted. Check the code selector
+ * whenever the previous function is an interrupt handler and stop
+ * backtracing if the interrupt was not in kernel space.
+ */
+
+static int
+kdba_bt_stack(kdb_machreg_t addr, int argcount, const struct task_struct *p,
+ int old_style)
+{
+ struct kdb_activation_record ar;
+ kdb_machreg_t rip = 0, rsp = 0, prev_rsp, cs;
+ kdb_symtab_t symtab;
+ int rip_at_rsp = 0, count = 0, btsp = 0, suppress,
+ interrupt_handler = 0, prev_interrupt_handler = 0, hardware_pushed,
+ prev_noret = 0;
+ struct pt_regs *regs = NULL;
+
+ kdbgetintenv("BTSP", &btsp);
+ suppress = !btsp;
+ memset(&ar, 0, sizeof(ar));
+ if (old_style)
+ kdb_printf("Using old style backtrace, unreliable with no arguments\n");
+
+ /*
+ * The caller may have supplied an address at which the stack traceback
+ * operation should begin. This address is assumed by this code to
+ * point to a return address on the stack to be traced back.
+ *
+ * Warning: type in the wrong address and you will get garbage in the
+ * backtrace.
+ */
+ if (addr) {
+ rsp = addr;
+ kdb_getword(&rip, rsp, sizeof(rip));
+ rip_at_rsp = 1;
+ suppress = 0;
+ kdba_get_stack_info(rsp, -1, &ar, NULL);
+ } else {
+ if (task_curr(p)) {
+ struct kdb_running_process *krp =
+ kdb_running_process + task_cpu(p);
+ kdb_machreg_t cs;
+ regs = krp->regs;
+ if (krp->seqno &&
+ krp->p == p &&
+ krp->seqno >= kdb_seqno - 1 &&
+ !KDB_NULL_REGS(regs)) {
+ /* valid saved state, continue processing */
+ } else {
+ kdb_printf
+ ("Process did not save state, cannot backtrace\n");
+ kdb_ps1(p);
+ return 0;
+ }
+ kdba_getregcontents(XCS, regs, &cs);
+ if ((cs & 0xffff) != __KERNEL_CS) {
+ kdb_printf("Stack is not in kernel space, backtrace not available\n");
+ return 0;
+ }
+ rip = krp->arch.ARCH_RIP;
+ rsp = krp->arch.ARCH_RSP;
+ kdba_get_stack_info(rsp, kdb_process_cpu(p), &ar, p);
+ } else {
+ /* Not on cpu, assume blocked. Blocked tasks do not
+ * have pt_regs. p->thread contains some data, alas
+ * what it contains differs between i386 and x86_64.
+ */
+ rip = kdba_bt_stack_rip(p);
+ rsp = p->thread.sp;
+ suppress = 0;
+ kdba_get_stack_info(rsp, -1, &ar, p);
+ }
+ }
+ if (!ar.stack.physical_start) {
+ kdb_printf(RSP "=0x%lx is not in a valid kernel stack, backtrace not available\n",
+ rsp);
+ return 0;
+ }
+ memset(&bb_actual, 0, sizeof(bb_actual));
+ bb_actual_set_value(BBRG_RSP, rsp);
+ bb_actual_set_valid(BBRG_RSP, 1);
+
+ kdb_printf(RSP "%*s" RIP "%*sFunction (args)\n",
+ 2*KDB_WORD_SIZE, " ",
+ 2*KDB_WORD_SIZE, " ");
+ if (ar.stack.next && !suppress)
+ kdb_printf(" ======================= <%s>\n",
+ ar.stack.id);
+
+ bb_cleanup();
+ /* Run through all the stacks */
+ while (ar.stack.physical_start) {
+ if (rip_at_rsp) {
+ rip = *(kdb_machreg_t *)rsp;
+ /* I wish that gcc were fixed to include a nop
+ * instruction after ATTRIB_NORET functions. The lack
+ * of a nop means that the return address points to the
+ * start of the next function, so fudge it to point to one
+ * byte earlier.
+ *
+ * No, we cannot just decrement all rip values.
+ * Sometimes an rip legally points to the start of a
+ * function, e.g. interrupted code or hand crafted
+ * assembler.
+ */
+ if (prev_noret) {
+ kdbnearsym(rip, &symtab);
+ if (rip == symtab.sym_start) {
+ --rip;
+ if (KDB_DEBUG(ARA))
+ kdb_printf("\tprev_noret, " RIP
+ "=0x%lx\n", rip);
+ }
+ }
+ }
+ kdbnearsym(rip, &symtab);
+ if (old_style) {
+ if (__kernel_text_address(rip) && !suppress) {
+ bt_print_one(rip, rsp, &ar, &symtab, 0);
+ ++count;
+ }
+ if (rsp == (unsigned long)regs) {
+ if (ar.stack.next && suppress)
+ kdb_printf(" ======================= <%s>\n",
+ ar.stack.id);
+ ++count;
+ suppress = 0;
+ }
+ rsp += sizeof(rip);
+ rip_at_rsp = 1;
+ if (rsp >= ar.stack.logical_end) {
+ if (!ar.stack.next)
+ break;
+ kdba_bt_new_stack(&ar, &rsp, &count, &suppress);
+ rip_at_rsp = 0;
+ continue;
+ }
+ } else {
+ /* Start each analysis with no dynamic data from the
+ * previous kdb_bb() run.
+ */
+ bb_cleanup();
+ kdb_bb(rip);
+ if (bb_giveup)
+ break;
+ prev_interrupt_handler = interrupt_handler;
+ interrupt_handler = bb_interrupt_handler(rip);
+ prev_rsp = rsp;
+ if (rip_at_rsp) {
+ if (prev_interrupt_handler) {
+ cs = *((kdb_machreg_t *)rsp + 1) & 0xffff;
+ hardware_pushed =
+ bb_hardware_pushed_arch(rsp, &ar);
+ } else {
+ cs = __KERNEL_CS;
+ hardware_pushed = 0;
+ }
+ rsp += sizeof(rip) + hardware_pushed;
+ if (KDB_DEBUG(ARA))
+ kdb_printf("%s: " RSP " "
+ kdb_machreg_fmt0
+ " -> " kdb_machreg_fmt0
+ " hardware_pushed %d"
+ " prev_interrupt_handler %d"
+ " cs 0x%lx\n",
+ __FUNCTION__,
+ prev_rsp,
+ rsp,
+ hardware_pushed,
+ prev_interrupt_handler,
+ cs);
+ if (rsp >= ar.stack.logical_end &&
+ ar.stack.next) {
+ kdba_bt_new_stack(&ar, &rsp, &count,
+ &suppress);
+ rip_at_rsp = 0;
+ continue;
+ }
+ bb_actual_set_value(BBRG_RSP, rsp);
+ } else {
+ cs = __KERNEL_CS;
+ }
+ rip_at_rsp = 1;
+ bb_actual_rollback(&ar);
+ if (bb_giveup)
+ break;
+ if (bb_actual_value(BBRG_RSP) < rsp) {
+ kdb_printf("%s: " RSP " is going backwards, "
+ kdb_machreg_fmt0 " -> "
+ kdb_machreg_fmt0 "\n",
+ __FUNCTION__,
+ rsp,
+ bb_actual_value(BBRG_RSP));
+ bb_giveup = 1;
+ break;
+ }
+ bb_arguments(&ar);
+ if (!suppress) {
+ bt_print_one(rip, prev_rsp, &ar, &symtab, argcount);
+ ++count;
+ }
+ /* Functions that terminate the backtrace */
+ if (strcmp(bb_func_name, "cpu_idle") == 0 ||
+ strcmp(bb_func_name, "child_rip") == 0)
+ break;
+ if (rsp >= ar.stack.logical_end &&
+ !ar.stack.next)
+ break;
+ if (rsp <= (unsigned long)regs &&
+ bb_actual_value(BBRG_RSP) > (unsigned long)regs) {
+ if (ar.stack.next && suppress)
+ kdb_printf(" ======================= <%s>\n",
+ ar.stack.id);
+ ++count;
+ suppress = 0;
+ }
+ if (cs != __KERNEL_CS) {
+ kdb_printf("Reached user space\n");
+ break;
+ }
+ rsp = bb_actual_value(BBRG_RSP);
+ }
+ prev_noret = bb_noret(bb_func_name);
+ if (count > 200)
+ break;
+ }
+ if (bb_giveup)
+ return 1;
+ bb_cleanup();
+ kdbnearsym_cleanup();
+
+ if (count > 200) {
+ kdb_printf("bt truncated, count limit reached\n");
+ return 1;
+ } else if (suppress) {
+ kdb_printf
+ ("bt did not find pt_regs - no trace produced. Suggest 'set BTSP 1'\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * kdba_bt_address
+ *
+ * Do a backtrace starting at a specified stack address. Use this if the
+ * heuristics get the stack decode wrong.
+ *
+ * Inputs:
+ * addr Address provided to 'bt' command.
+ * argcount
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ * Remarks:
+ * mds %rsp comes in handy when examining the stack to do a manual
+ * traceback.
+ */
+
+int kdba_bt_address(kdb_machreg_t addr, int argcount)
+{
+ int ret;
+ kdba_id_init(&kdb_di); /* kdb_bb needs this done once */
+ ret = kdba_bt_stack(addr, argcount, NULL, 0);
+ if (ret == 1)
+ ret = kdba_bt_stack(addr, argcount, NULL, 1);
+ return ret;
+}
+
+/*
+ * kdba_bt_process
+ *
+ * Do a backtrace for a specified process.
+ *
+ * Inputs:
+ * p Struct task pointer extracted by 'bt' command.
+ * argcount
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ */
+
+int kdba_bt_process(const struct task_struct *p, int argcount)
+{
+ int ret;
+ kdba_id_init(&kdb_di); /* kdb_bb needs this done once */
+ ret = kdba_bt_stack(0, argcount, p, 0);
+ if (ret == 1)
+ ret = kdba_bt_stack(0, argcount, p, 1);
+ return ret;
+}
+
+static int __init kdba_bt_x86_init(void)
+{
+ int i, c, cp = -1;
+ struct bb_name_state *r;
+
+ kdb_register_repeat("bb1", kdb_bb1, "<vaddr>", "Analyse one basic block", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bb_all", kdb_bb_all, "", "Backtrace check on all built in functions", 0, KDB_REPEAT_NONE);
+
+ /* Split the opcode usage table by the first letter of each set of
+ * opcodes, for faster mapping of opcode to its operand usage.
+ */
+ for (i = 0; i < ARRAY_SIZE(bb_opcode_usage_all); ++i) {
+ c = bb_opcode_usage_all[i].opcode[0] - 'a';
+ if (c != cp) {
+ cp = c;
+ bb_opcode_usage[c].opcode = bb_opcode_usage_all + i;
+ }
+ ++bb_opcode_usage[c].size;
+ }
+
+ bb_common_interrupt = kallsyms_lookup_name("common_interrupt");
+ bb_error_entry = kallsyms_lookup_name("error_entry");
+ bb_ret_from_intr = kallsyms_lookup_name("ret_from_intr");
+ bb_thread_return = kallsyms_lookup_name("thread_return");
+ bb_sync_regs = kallsyms_lookup_name("sync_regs");
+ bb_save_v86_state = kallsyms_lookup_name("save_v86_state");
+ bb__sched_text_start = kallsyms_lookup_name("__sched_text_start");
+ bb__sched_text_end = kallsyms_lookup_name("__sched_text_end");
+ bb_save_args = kallsyms_lookup_name("save_args");
+ bb_save_rest = kallsyms_lookup_name("save_rest");
+ bb_save_paranoid = kallsyms_lookup_name("save_paranoid");
+ for (i = 0, r = bb_special_cases;
+ i < ARRAY_SIZE(bb_special_cases);
+ ++i, ++r) {
+ r->address = kallsyms_lookup_name(r->name);
+ }
+
+#ifdef CONFIG_4KSTACKS
+ kdba_hardirq_ctx = (struct thread_info **)kallsyms_lookup_name("hardirq_ctx");
+ kdba_softirq_ctx = (struct thread_info **)kallsyms_lookup_name("softirq_ctx");
+#endif /* CONFIG_4KSTACKS */
+
+ return 0;
+}
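/*
 * A minimal sketch of how the per-letter index built in kdba_bt_x86_init()
 * is consumed, assuming an exact-match lookup; the field layout follows the
 * init loop above, but the matching done by the real kdb_bb() analyser may
 * differ (e.g. it may strip operand-size suffixes):
 *
 *	int i, c = name[0] - 'a';
 *
 *	for (i = 0; i < bb_opcode_usage[c].size; ++i)
 *		if (strcmp(bb_opcode_usage[c].opcode[i].opcode, name) == 0)
 *			return bb_opcode_usage[c].opcode + i;
 *	return NULL;
 */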
+
+static void __exit kdba_bt_x86_exit(void)
+{
+ kdb_unregister("bb1");
+ kdb_unregister("bb_all");
+}
+
+module_init(kdba_bt_x86_init)
+module_exit(kdba_bt_x86_exit)
--- /dev/null
+/*
+ * Kernel Debugger Architecture Independent Support Functions
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2008 Silicon Graphics, Inc. All Rights Reserved.
+ */
+
+#include <linux/string.h>
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/ptrace.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/kdb.h>
+#include <linux/kdbprivate.h>
+#include <linux/interrupt.h>
+#include <linux/kdebug.h>
+#include <linux/cpumask.h>
+
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/desc.h>
+
+static kdb_machreg_t
+kdba_getcr(int regnum)
+{
+ kdb_machreg_t contents = 0;
+ switch(regnum) {
+ case 0:
+ __asm__ (_ASM_MOV " %%cr0,%0\n\t":"=r"(contents));
+ break;
+ case 1:
+ break;
+ case 2:
+ __asm__ (_ASM_MOV " %%cr2,%0\n\t":"=r"(contents));
+ break;
+ case 3:
+ __asm__ (_ASM_MOV " %%cr3,%0\n\t":"=r"(contents));
+ break;
+ case 4:
+ __asm__ (_ASM_MOV " %%cr4,%0\n\t":"=r"(contents));
+ break;
+ default:
+ break;
+ }
+
+ return contents;
+}
+
+void
+kdba_putdr(int regnum, kdb_machreg_t contents)
+{
+ switch(regnum) {
+ case 0:
+ __asm__ (_ASM_MOV " %0,%%db0\n\t"::"r"(contents));
+ break;
+ case 1:
+ __asm__ (_ASM_MOV " %0,%%db1\n\t"::"r"(contents));
+ break;
+ case 2:
+ __asm__ (_ASM_MOV " %0,%%db2\n\t"::"r"(contents));
+ break;
+ case 3:
+ __asm__ (_ASM_MOV " %0,%%db3\n\t"::"r"(contents));
+ break;
+ case 4:
+ case 5:
+ break;
+ case 6:
+ __asm__ (_ASM_MOV " %0,%%db6\n\t"::"r"(contents));
+ break;
+ case 7:
+ __asm__ (_ASM_MOV " %0,%%db7\n\t"::"r"(contents));
+ break;
+ default:
+ break;
+ }
+}
+
+kdb_machreg_t
+kdba_getdr(int regnum)
+{
+ kdb_machreg_t contents = 0;
+ switch(regnum) {
+ case 0:
+ __asm__ (_ASM_MOV " %%db0,%0\n\t":"=r"(contents));
+ break;
+ case 1:
+ __asm__ (_ASM_MOV " %%db1,%0\n\t":"=r"(contents));
+ break;
+ case 2:
+ __asm__ (_ASM_MOV " %%db2,%0\n\t":"=r"(contents));
+ break;
+ case 3:
+ __asm__ (_ASM_MOV " %%db3,%0\n\t":"=r"(contents));
+ break;
+ case 4:
+ case 5:
+ break;
+ case 6:
+ __asm__ (_ASM_MOV " %%db6,%0\n\t":"=r"(contents));
+ break;
+ case 7:
+ __asm__ (_ASM_MOV " %%db7,%0\n\t":"=r"(contents));
+ break;
+ default:
+ break;
+ }
+
+ return contents;
+}
+
+kdb_machreg_t
+kdba_getdr6(void)
+{
+ return kdba_getdr(6);
+}
+
+kdb_machreg_t
+kdba_getdr7(void)
+{
+ return kdba_getdr(7);
+}
+
+void
+kdba_putdr6(kdb_machreg_t contents)
+{
+ kdba_putdr(6, contents);
+}
+
+static void
+kdba_putdr7(kdb_machreg_t contents)
+{
+ kdba_putdr(7, contents);
+}
+
+void
+kdba_installdbreg(kdb_bp_t *bp)
+{
+ int cpu = smp_processor_id();
+
+ kdb_machreg_t dr7;
+
+ dr7 = kdba_getdr7();
+
+ kdba_putdr(bp->bp_hard[cpu]->bph_reg, bp->bp_addr);
+
+ dr7 |= DR7_GE;
+ if (cpu_has_de)
+ set_in_cr4(X86_CR4_DE);
+
+ switch (bp->bp_hard[cpu]->bph_reg){
+ case 0:
+ DR7_RW0SET(dr7,bp->bp_hard[cpu]->bph_mode);
+ DR7_LEN0SET(dr7,bp->bp_hard[cpu]->bph_length);
+ DR7_G0SET(dr7);
+ break;
+ case 1:
+ DR7_RW1SET(dr7,bp->bp_hard[cpu]->bph_mode);
+ DR7_LEN1SET(dr7,bp->bp_hard[cpu]->bph_length);
+ DR7_G1SET(dr7);
+ break;
+ case 2:
+ DR7_RW2SET(dr7,bp->bp_hard[cpu]->bph_mode);
+ DR7_LEN2SET(dr7,bp->bp_hard[cpu]->bph_length);
+ DR7_G2SET(dr7);
+ break;
+ case 3:
+ DR7_RW3SET(dr7,bp->bp_hard[cpu]->bph_mode);
+ DR7_LEN3SET(dr7,bp->bp_hard[cpu]->bph_length);
+ DR7_G3SET(dr7);
+ break;
+ default:
+ kdb_printf("kdb: Bad debug register!! %ld\n",
+ bp->bp_hard[cpu]->bph_reg);
+ break;
+ }
+
+ kdba_putdr7(dr7);
+ return;
+}
+
+void
+kdba_removedbreg(kdb_bp_t *bp)
+{
+ int regnum;
+ kdb_machreg_t dr7;
+ int cpu = smp_processor_id();
+
+ if (!bp->bp_hard[cpu])
+ return;
+
+ regnum = bp->bp_hard[cpu]->bph_reg;
+
+ dr7 = kdba_getdr7();
+
+ kdba_putdr(regnum, 0);
+
+ switch (regnum) {
+ case 0:
+ DR7_G0CLR(dr7);
+ DR7_L0CLR(dr7);
+ break;
+ case 1:
+ DR7_G1CLR(dr7);
+ DR7_L1CLR(dr7);
+ break;
+ case 2:
+ DR7_G2CLR(dr7);
+ DR7_L2CLR(dr7);
+ break;
+ case 3:
+ DR7_G3CLR(dr7);
+ DR7_L3CLR(dr7);
+ break;
+ default:
+ kdb_printf("kdb: Bad debug register!! %d\n", regnum);
+ break;
+ }
+
+ kdba_putdr7(dr7);
+}
+
+struct kdbregs {
+ char *reg_name;
+ size_t reg_offset;
+};
+
+static struct kdbregs dbreglist[] = {
+ { "dr0", 0 },
+ { "dr1", 1 },
+ { "dr2", 2 },
+ { "dr3", 3 },
+ { "dr6", 6 },
+ { "dr7", 7 },
+};
+
+static const int ndbreglist = sizeof(dbreglist) / sizeof(struct kdbregs);
+
+#ifdef CONFIG_X86_32
+static struct kdbregs kdbreglist[] = {
+ { "ax", offsetof(struct pt_regs, ax) },
+ { "bx", offsetof(struct pt_regs, bx) },
+ { "cx", offsetof(struct pt_regs, cx) },
+ { "dx", offsetof(struct pt_regs, dx) },
+
+ { "si", offsetof(struct pt_regs, si) },
+ { "di", offsetof(struct pt_regs, di) },
+ { "sp", offsetof(struct pt_regs, sp) },
+ { "ip", offsetof(struct pt_regs, ip) },
+
+ { "bp", offsetof(struct pt_regs, bp) },
+ { "ss", offsetof(struct pt_regs, ss) },
+ { "cs", offsetof(struct pt_regs, cs) },
+ { "flags", offsetof(struct pt_regs, flags) },
+
+ { "ds", offsetof(struct pt_regs, ds) },
+ { "es", offsetof(struct pt_regs, es) },
+ { "origax", offsetof(struct pt_regs, orig_ax) },
+
+};
+
+static const int nkdbreglist = sizeof(kdbreglist) / sizeof(struct kdbregs);
+
+
+/*
+ * kdba_getregcontents
+ *
+ * Return the contents of the register specified by the
+ * input string argument. Return an error if the string
+ * does not match a machine register.
+ *
+ * The following pseudo register names are supported:
+ * &regs - Prints address of exception frame
+ * kesp - Prints kernel stack pointer at time of fault
+ * cesp - Prints current kernel stack pointer, inside kdb
+ * ceflags - Prints current flags, inside kdb
+ * %<regname> - Uses the value of the registers at the
+ * last time the user process entered kernel
+ * mode, instead of the registers at the time
+ * kdb was entered.
+ *
+ * Parameters:
+ * regname Pointer to string naming register
+ * regs Pointer to structure containing registers.
+ * Outputs:
+ * *contents Pointer to unsigned long to receive register contents
+ * Returns:
+ * 0 Success
+ * KDB_BADREG Invalid register name
+ * Locking:
+ * None.
+ * Remarks:
+ * If kdb was entered via an interrupt from the kernel itself then
+ * ss and sp are *not* on the stack.
+ */
+
+int
+kdba_getregcontents(const char *regname,
+ struct pt_regs *regs,
+ kdb_machreg_t *contents)
+{
+ int i;
+
+ if (strcmp(regname, "cesp") == 0) {
+ asm volatile("movl %%esp,%0":"=m" (*contents));
+ return 0;
+ }
+
+ if (strcmp(regname, "ceflags") == 0) {
+ unsigned long flags;
+ local_save_flags(flags);
+ *contents = flags;
+ return 0;
+ }
+
+ if (regname[0] == '%') {
+ /* User registers: %%e[a-c]x, etc */
+ regname++;
+ regs = (struct pt_regs *)
+ (kdb_current_task->thread.sp0 - sizeof(struct pt_regs));
+ }
+
+ for (i=0; i<ndbreglist; i++) {
+ if (strnicmp(dbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < ndbreglist)
+ && (strlen(dbreglist[i].reg_name) == strlen(regname))) {
+ *contents = kdba_getdr(dbreglist[i].reg_offset);
+ return 0;
+ }
+
+ if (!regs) {
+ kdb_printf("%s: pt_regs not available, use bt* or pid to select a different task\n", __FUNCTION__);
+ return KDB_BADREG;
+ }
+
+ if (strcmp(regname, "&regs") == 0) {
+ *contents = (unsigned long)regs;
+ return 0;
+ }
+
+ if (strcmp(regname, "kesp") == 0) {
+ *contents = (unsigned long)regs + sizeof(struct pt_regs);
+ if ((regs->cs & 0xffff) == __KERNEL_CS) {
+ /* sp and ss are not on stack */
+ *contents -= 2*4;
+ }
+ return 0;
+ }
+
+ for (i=0; i<nkdbreglist; i++) {
+ if (strnicmp(kdbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < nkdbreglist)
+ && (strlen(kdbreglist[i].reg_name) == strlen(regname))) {
+ if ((regs->cs & 0xffff) == __KERNEL_CS) {
+ /* No cpl switch, sp and ss are not on stack */
+ if (strcmp(kdbreglist[i].reg_name, "sp") == 0) {
+ *contents = (kdb_machreg_t)regs +
+ sizeof(struct pt_regs) - 2*4;
+ return(0);
+ }
+ if (strcmp(kdbreglist[i].reg_name, "xss") == 0) {
+ asm volatile(
+ "pushl %%ss\n"
+ "popl %0\n"
+ :"=m" (*contents));
+ return(0);
+ }
+ }
+ *contents = *(unsigned long *)((unsigned long)regs +
+ kdbreglist[i].reg_offset);
+ return(0);
+ }
+
+ return KDB_BADREG;
+}
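/*
 * A minimal usage sketch for kdba_getregcontents(), assuming a caller that
 * already holds a valid pt_regs pointer; machine register names from
 * kdbreglist and the pseudo registers documented above are looked up the
 * same way:
 *
 *	kdb_machreg_t ip, frame;
 *
 *	if (kdba_getregcontents("ip", regs, &ip) == 0)
 *		kdb_printf("ip = 0x%lx\n", ip);
 *	if (kdba_getregcontents("&regs", regs, &frame) == 0)
 *		kdb_printf("exception frame at 0x%lx\n", frame);
 */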
+
+/*
+ * kdba_setregcontents
+ *
+ * Set the contents of the register specified by the
+ * input string argument. Return an error if the string
+ * does not match a machine register.
+ *
+ * Supports modification of user-mode registers via
+ * %<register-name>
+ *
+ * Parameters:
+ * regname Pointer to string naming register
+ * regs Pointer to structure containing registers.
+ * contents Unsigned long containing new register contents
+ * Outputs:
+ * Returns:
+ * 0 Success
+ * KDB_BADREG Invalid register name
+ * Locking:
+ * None.
+ * Remarks:
+ */
+
+int
+kdba_setregcontents(const char *regname,
+ struct pt_regs *regs,
+ unsigned long contents)
+{
+ int i;
+
+ if (regname[0] == '%') {
+ regname++;
+ regs = (struct pt_regs *)
+ (kdb_current_task->thread.sp0 - sizeof(struct pt_regs));
+ }
+
+ for (i=0; i<ndbreglist; i++) {
+ if (strnicmp(dbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < ndbreglist)
+ && (strlen(dbreglist[i].reg_name) == strlen(regname))) {
+ kdba_putdr(dbreglist[i].reg_offset, contents);
+ return 0;
+ }
+
+ if (!regs) {
+ kdb_printf("%s: pt_regs not available, use bt* or pid to select a different task\n", __FUNCTION__);
+ return KDB_BADREG;
+ }
+
+ for (i=0; i<nkdbreglist; i++) {
+ if (strnicmp(kdbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < nkdbreglist)
+ && (strlen(kdbreglist[i].reg_name) == strlen(regname))) {
+ *(unsigned long *)((unsigned long)regs
+ + kdbreglist[i].reg_offset) = contents;
+ return 0;
+ }
+
+ return KDB_BADREG;
+}
+
+/*
+ * kdba_pt_regs
+ *
+ * Format a struct pt_regs
+ *
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ * Remarks:
+ * If no address is supplied, it uses the last irq pt_regs.
+ */
+
+static int
+kdba_pt_regs(int argc, const char **argv)
+{
+ int diag;
+ kdb_machreg_t addr;
+ long offset = 0;
+ int nextarg;
+ struct pt_regs *p;
+ static const char *fmt = " %-11.11s 0x%lx\n";
+
+ if (argc == 0) {
+ addr = (kdb_machreg_t) get_irq_regs();
+ } else if (argc == 1) {
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+ } else {
+ return KDB_ARGCOUNT;
+ }
+
+ p = (struct pt_regs *) addr;
+ kdb_printf("struct pt_regs 0x%p-0x%p\n", p, (unsigned char *)p + sizeof(*p) - 1);
+ kdb_print_nameval("bx", p->bx);
+ kdb_print_nameval("cx", p->cx);
+ kdb_print_nameval("dx", p->dx);
+ kdb_print_nameval("si", p->si);
+ kdb_print_nameval("di", p->di);
+ kdb_print_nameval("bp", p->bp);
+ kdb_print_nameval("ax", p->ax);
+ kdb_printf(fmt, "ds", p->ds);
+ kdb_printf(fmt, "es", p->es);
+ kdb_print_nameval("orig_ax", p->orig_ax);
+ kdb_print_nameval("ip", p->ip);
+ kdb_printf(fmt, "cs", p->cs);
+ kdb_printf(fmt, "flags", p->flags);
+ kdb_printf(fmt, "sp", p->sp);
+ kdb_printf(fmt, "ss", p->ss);
+ return 0;
+}
+
+#else /* CONFIG_X86_32 */
+
+static struct kdbregs kdbreglist[] = {
+ { "r15", offsetof(struct pt_regs, r15) },
+ { "r14", offsetof(struct pt_regs, r14) },
+ { "r13", offsetof(struct pt_regs, r13) },
+ { "r12", offsetof(struct pt_regs, r12) },
+ { "bp", offsetof(struct pt_regs, bp) },
+ { "bx", offsetof(struct pt_regs, bx) },
+ { "r11", offsetof(struct pt_regs, r11) },
+ { "r10", offsetof(struct pt_regs, r10) },
+ { "r9", offsetof(struct pt_regs, r9) },
+ { "r8", offsetof(struct pt_regs, r8) },
+ { "ax", offsetof(struct pt_regs, ax) },
+ { "cx", offsetof(struct pt_regs, cx) },
+ { "dx", offsetof(struct pt_regs, dx) },
+ { "si", offsetof(struct pt_regs, si) },
+ { "di", offsetof(struct pt_regs, di) },
+ { "orig_ax", offsetof(struct pt_regs, orig_ax) },
+ { "ip", offsetof(struct pt_regs, ip) },
+ { "cs", offsetof(struct pt_regs, cs) },
+ { "flags", offsetof(struct pt_regs, flags) },
+ { "sp", offsetof(struct pt_regs, sp) },
+ { "ss", offsetof(struct pt_regs, ss) },
+};
+
+static const int nkdbreglist = sizeof(kdbreglist) / sizeof(struct kdbregs);
+
+
+/*
+ * kdba_getregcontents
+ *
+ * Return the contents of the register specified by the
+ * input string argument. Return an error if the string
+ * does not match a machine register.
+ *
+ * The following pseudo register names are supported:
+ * &regs - Prints address of exception frame
+ * krsp - Prints kernel stack pointer at time of fault
+ * crsp - Prints current kernel stack pointer, inside kdb
+ * ceflags - Prints current flags, inside kdb
+ * %<regname> - Uses the value of the registers at the
+ * last time the user process entered kernel
+ * mode, instead of the registers at the time
+ * kdb was entered.
+ *
+ * Parameters:
+ * regname Pointer to string naming register
+ * regs Pointer to structure containing registers.
+ * Outputs:
+ * *contents Pointer to unsigned long to receive register contents
+ * Returns:
+ * 0 Success
+ * KDB_BADREG Invalid register name
+ * Locking:
+ * None.
+ * Remarks:
+ * If kdb was entered via an interrupt from the kernel itself then
+ * ss and sp are *not* on the stack.
+ */
+int
+kdba_getregcontents(const char *regname,
+ struct pt_regs *regs,
+ kdb_machreg_t *contents)
+{
+ int i;
+
+ if (strcmp(regname, "&regs") == 0) {
+ *contents = (unsigned long)regs;
+ return 0;
+ }
+
+ if (strcmp(regname, "krsp") == 0) {
+ *contents = (unsigned long)regs + sizeof(struct pt_regs);
+ if ((regs->cs & 0xffff) == __KERNEL_CS) {
+ /* sp and ss are not on stack */
+ *contents -= 2*4;
+ }
+ return 0;
+ }
+
+ if (strcmp(regname, "crsp") == 0) {
+ asm volatile("movq %%rsp,%0":"=m" (*contents));
+ return 0;
+ }
+
+ if (strcmp(regname, "ceflags") == 0) {
+ unsigned long flags;
+ local_save_flags(flags);
+ *contents = flags;
+ return 0;
+ }
+
+ if (regname[0] == '%') {
+ /* User registers: %%r[a-c]x, etc */
+ regname++;
+ regs = (struct pt_regs *)
+ (current->thread.sp0 - sizeof(struct pt_regs));
+ }
+
+ for (i=0; i<nkdbreglist; i++) {
+ if (strnicmp(kdbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < nkdbreglist)
+ && (strlen(kdbreglist[i].reg_name) == strlen(regname))) {
+ if ((regs->cs & 0xffff) == __KERNEL_CS) {
+ /* No cpl switch, sp is not on stack */
+ if (strcmp(kdbreglist[i].reg_name, "sp") == 0) {
+ *contents = (kdb_machreg_t)regs +
+ sizeof(struct pt_regs) - 2*8;
+ return(0);
+ }
+#if 0 /* FIXME */
+ if (strcmp(kdbreglist[i].reg_name, "ss") == 0) {
+ kdb_machreg_t r;
+
+ r = (kdb_machreg_t)regs +
+ sizeof(struct pt_regs) - 2*8;
+ *contents = (kdb_machreg_t)SS(r); /* XXX */
+ return(0);
+ }
+#endif
+ }
+ *contents = *(unsigned long *)((unsigned long)regs +
+ kdbreglist[i].reg_offset);
+ return(0);
+ }
+
+ for (i=0; i<ndbreglist; i++) {
+ if (strnicmp(dbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < ndbreglist)
+ && (strlen(dbreglist[i].reg_name) == strlen(regname))) {
+ *contents = kdba_getdr(dbreglist[i].reg_offset);
+ return 0;
+ }
+ return KDB_BADREG;
+}
+
+/*
+ * kdba_setregcontents
+ *
+ * Set the contents of the register specified by the
+ * input string argument. Return an error if the string
+ * does not match a machine register.
+ *
+ * Supports modification of user-mode registers via
+ * %<register-name>
+ *
+ * Parameters:
+ * regname Pointer to string naming register
+ * regs Pointer to structure containing registers.
+ * contents Unsigned long containing new register contents
+ * Outputs:
+ * Returns:
+ * 0 Success
+ * KDB_BADREG Invalid register name
+ * Locking:
+ * None.
+ * Remarks:
+ */
+
+int
+kdba_setregcontents(const char *regname,
+ struct pt_regs *regs,
+ unsigned long contents)
+{
+ int i;
+
+ if (regname[0] == '%') {
+ regname++;
+ regs = (struct pt_regs *)
+ (current->thread.sp0 - sizeof(struct pt_regs));
+ }
+
+ for (i=0; i<nkdbreglist; i++) {
+ if (strnicmp(kdbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < nkdbreglist)
+ && (strlen(kdbreglist[i].reg_name) == strlen(regname))) {
+ *(unsigned long *)((unsigned long)regs
+ + kdbreglist[i].reg_offset) = contents;
+ return 0;
+ }
+
+ for (i=0; i<ndbreglist; i++) {
+ if (strnicmp(dbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < ndbreglist)
+ && (strlen(dbreglist[i].reg_name) == strlen(regname))) {
+ kdba_putdr(dbreglist[i].reg_offset, contents);
+ return 0;
+ }
+
+ return KDB_BADREG;
+}
+
+/*
+ * kdba_pt_regs
+ *
+ * Format a struct pt_regs
+ *
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ * Remarks:
+ * If no address is supplied, it uses the last irq pt_regs.
+ */
+
+static int
+kdba_pt_regs(int argc, const char **argv)
+{
+ int diag;
+ kdb_machreg_t addr;
+ long offset = 0;
+ int nextarg;
+ struct pt_regs *p;
+ static const char *fmt = " %-11.11s 0x%lx\n";
+ static int first_time = 1;
+
+ if (argc == 0) {
+ addr = (kdb_machreg_t) get_irq_regs();
+ } else if (argc == 1) {
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+ } else {
+ return KDB_ARGCOUNT;
+ }
+
+ p = (struct pt_regs *) addr;
+ if (first_time) {
+ first_time = 0;
+ kdb_printf("\n+++ Warning: x86_64 pt_regs are not always "
+ "completely defined, r15-bx may be invalid\n\n");
+ }
+ kdb_printf("struct pt_regs 0x%p-0x%p\n", p, (unsigned char *)p + sizeof(*p) - 1);
+ kdb_print_nameval("r15", p->r15);
+ kdb_print_nameval("r14", p->r14);
+ kdb_print_nameval("r13", p->r13);
+ kdb_print_nameval("r12", p->r12);
+ kdb_print_nameval("bp", p->bp);
+ kdb_print_nameval("bx", p->bx);
+ kdb_print_nameval("r11", p->r11);
+ kdb_print_nameval("r10", p->r10);
+ kdb_print_nameval("r9", p->r9);
+ kdb_print_nameval("r8", p->r8);
+ kdb_print_nameval("ax", p->ax);
+ kdb_print_nameval("cx", p->cx);
+ kdb_print_nameval("dx", p->dx);
+ kdb_print_nameval("si", p->si);
+ kdb_print_nameval("di", p->di);
+ kdb_print_nameval("orig_ax", p->orig_ax);
+ kdb_print_nameval("ip", p->ip);
+ kdb_printf(fmt, "cs", p->cs);
+ kdb_printf(fmt, "flags", p->flags);
+ kdb_printf(fmt, "sp", p->sp);
+ kdb_printf(fmt, "ss", p->ss);
+ return 0;
+}
+#endif /* CONFIG_X86_32 */
+
+/*
+ * kdba_dumpregs
+ *
+ * Dump the specified register set to the display.
+ *
+ * Parameters:
+ * regs Pointer to structure containing registers.
+ * type Character string identifying register set to dump
+ * extra string further identifying register (optional)
+ * Outputs:
+ * Returns:
+ * 0 Success
+ * Locking:
+ * None.
+ * Remarks:
+ * This function will dump the general register set if the type
+ * argument is NULL (struct pt_regs). The alternate register
+ * set types supported by this function are:
+ *
+ * d Debug registers
+ * c Control registers
+ * u User registers at most recent entry to kernel
+ * for the process currently selected with the "pid" command.
+ * The following is not yet implemented:
+ * r Memory Type Range Registers (extra defines register)
+ *
+ * MSRs on i386/x86_64 are handled by the rdmsr/wrmsr commands.
+ */
+
+int
+kdba_dumpregs(struct pt_regs *regs,
+ const char *type,
+ const char *extra)
+{
+ int i;
+ int count = 0;
+
+ if (type
+ && (type[0] == 'u')) {
+ type = NULL;
+ regs = (struct pt_regs *)
+ (kdb_current_task->thread.sp0 - sizeof(struct pt_regs));
+ }
+
+ if (type == NULL) {
+ struct kdbregs *rlp;
+ kdb_machreg_t contents;
+
+ if (!regs) {
+ kdb_printf("%s: pt_regs not available, use bt* or pid to select a different task\n", __FUNCTION__);
+ return KDB_BADREG;
+ }
+
+#ifdef CONFIG_X86_32
+ for (i=0, rlp=kdbreglist; i<nkdbreglist; i++,rlp++) {
+ kdb_printf("%s = ", rlp->reg_name);
+ kdba_getregcontents(rlp->reg_name, regs, &contents);
+ kdb_printf("0x%08lx ", contents);
+ if ((++count % 4) == 0)
+ kdb_printf("\n");
+ }
+#else
+ for (i=0, rlp=kdbreglist; i<nkdbreglist; i++,rlp++) {
+ kdb_printf("%8s = ", rlp->reg_name);
+ kdba_getregcontents(rlp->reg_name, regs, &contents);
+ kdb_printf("0x%016lx ", contents);
+ if ((++count % 2) == 0)
+ kdb_printf("\n");
+ }
+#endif
+
+ kdb_printf("®s = 0x%p\n", regs);
+
+ return 0;
+ }
+
+ switch (type[0]) {
+ case 'd':
+ {
+ unsigned long dr[8];
+
+ for(i=0; i<8; i++) {
+ if ((i == 4) || (i == 5)) continue;
+ dr[i] = kdba_getdr(i);
+ }
+ kdb_printf("dr0 = 0x%08lx dr1 = 0x%08lx dr2 = 0x%08lx dr3 = 0x%08lx\n",
+ dr[0], dr[1], dr[2], dr[3]);
+ kdb_printf("dr6 = 0x%08lx dr7 = 0x%08lx\n",
+ dr[6], dr[7]);
+ return 0;
+ }
+ case 'c':
+ {
+ unsigned long cr[5];
+
+ for (i=0; i<5; i++) {
+ cr[i] = kdba_getcr(i);
+ }
+ kdb_printf("cr0 = 0x%08lx cr1 = 0x%08lx cr2 = 0x%08lx cr3 = 0x%08lx\ncr4 = 0x%08lx\n",
+ cr[0], cr[1], cr[2], cr[3], cr[4]);
+ return 0;
+ }
+ case 'r':
+ break;
+ default:
+ return KDB_BADREG;
+ }
+
+ /* NOTREACHED */
+ return 0;
+}
+EXPORT_SYMBOL(kdba_dumpregs);
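/*
 * A minimal sketch of the type selector handled above, assuming a valid
 * pt_regs pointer where one is required:
 *
 *	kdba_dumpregs(regs, NULL, NULL);    general registers from regs
 *	kdba_dumpregs(regs, "d", NULL);     dr0-dr3, dr6 and dr7
 *	kdba_dumpregs(regs, "c", NULL);     cr0-cr4
 *	kdba_dumpregs(NULL, "u", NULL);     user registers of kdb_current_task
 */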
+
+kdb_machreg_t
+kdba_getpc(struct pt_regs *regs)
+{
+ return regs ? regs->ip : 0;
+}
+
+int
+kdba_setpc(struct pt_regs *regs, kdb_machreg_t newpc)
+{
+ if (KDB_NULL_REGS(regs))
+ return KDB_BADREG;
+ regs->ip = newpc;
+ KDB_STATE_SET(IP_ADJUSTED);
+ return 0;
+}
+
+/*
+ * kdba_main_loop
+ *
+ * Do any architecture specific set up before entering the main kdb loop.
+ * The primary function of this routine is to make all processes look the
+ * same to kdb; kdb must be able to list a process without worrying whether
+ * the process is running or blocked, so make all processes look as though
+ * they are blocked.
+ *
+ * Inputs:
+ * reason The reason KDB was invoked
+ * error The hardware-defined error code
+ * reason2 kdb's current reason code. Initially error but can change
+ * according to kdb state.
+ * db_result Result from break or debug point.
+ * regs The exception frame at time of fault/breakpoint. If reason
+ * is SILENT or CPU_UP then regs is NULL, otherwise it should
+ * always be valid.
+ * Returns:
+ * 0 KDB was invoked for an event for which it was not responsible
+ * 1 KDB handled the event for which it was invoked.
+ * Outputs:
+ * Sets ip and sp in current->thread.
+ * Locking:
+ * None.
+ * Remarks:
+ * none.
+ */
+
+int
+kdba_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
+ kdb_dbtrap_t db_result, struct pt_regs *regs)
+{
+ int ret;
+
+#ifdef CONFIG_X86_64
+ if (regs)
+ kdba_getregcontents("sp", regs, &(current->thread.sp));
+#endif
+ ret = kdb_save_running(regs, reason, reason2, error, db_result);
+ kdb_unsave_running(regs);
+ return ret;
+}
+
+void
+kdba_disableint(kdb_intstate_t *state)
+{
+ unsigned long *fp = (unsigned long *)state;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ *fp = flags;
+}
+
+void
+kdba_restoreint(kdb_intstate_t *state)
+{
+ unsigned long flags = *(unsigned long *)state;
+ local_irq_restore(flags);
+}
+
+void
+kdba_setsinglestep(struct pt_regs *regs)
+{
+ if (KDB_NULL_REGS(regs))
+ return;
+ if (regs->flags & X86_EFLAGS_IF)
+ KDB_STATE_SET(A_IF);
+ else
+ KDB_STATE_CLEAR(A_IF);
+ regs->flags = (regs->flags | X86_EFLAGS_TF) & ~X86_EFLAGS_IF;
+}
+
+void
+kdba_clearsinglestep(struct pt_regs *regs)
+{
+ if (KDB_NULL_REGS(regs))
+ return;
+ if (KDB_STATE(A_IF))
+ regs->flags |= X86_EFLAGS_IF;
+ else
+ regs->flags &= ~X86_EFLAGS_IF;
+}
+
+#ifdef CONFIG_X86_32
+int asmlinkage
+kdba_setjmp(kdb_jmp_buf *jb)
+{
+#ifdef CONFIG_FRAME_POINTER
+ __asm__ ("movl 8(%esp), %eax\n\t"
+ "movl %ebx, 0(%eax)\n\t"
+ "movl %esi, 4(%eax)\n\t"
+ "movl %edi, 8(%eax)\n\t"
+ "movl (%esp), %ecx\n\t"
+ "movl %ecx, 12(%eax)\n\t"
+ "leal 8(%esp), %ecx\n\t"
+ "movl %ecx, 16(%eax)\n\t"
+ "movl 4(%esp), %ecx\n\t"
+ "movl %ecx, 20(%eax)\n\t");
+#else /* CONFIG_FRAME_POINTER */
+ __asm__ ("movl 4(%esp), %eax\n\t"
+ "movl %ebx, 0(%eax)\n\t"
+ "movl %esi, 4(%eax)\n\t"
+ "movl %edi, 8(%eax)\n\t"
+ "movl %ebp, 12(%eax)\n\t"
+ "leal 4(%esp), %ecx\n\t"
+ "movl %ecx, 16(%eax)\n\t"
+ "movl 0(%esp), %ecx\n\t"
+ "movl %ecx, 20(%eax)\n\t");
+#endif /* CONFIG_FRAME_POINTER */
+ return 0;
+}
+
+void asmlinkage
+kdba_longjmp(kdb_jmp_buf *jb, int reason)
+{
+#ifdef CONFIG_FRAME_POINTER
+ __asm__("movl 8(%esp), %ecx\n\t"
+ "movl 12(%esp), %eax\n\t"
+ "movl 20(%ecx), %edx\n\t"
+ "movl 0(%ecx), %ebx\n\t"
+ "movl 4(%ecx), %esi\n\t"
+ "movl 8(%ecx), %edi\n\t"
+ "movl 12(%ecx), %ebp\n\t"
+ "movl 16(%ecx), %esp\n\t"
+ "jmp *%edx\n");
+#else /* CONFIG_FRAME_POINTER */
+ __asm__("movl 4(%esp), %ecx\n\t"
+ "movl 8(%esp), %eax\n\t"
+ "movl 20(%ecx), %edx\n\t"
+ "movl 0(%ecx), %ebx\n\t"
+ "movl 4(%ecx), %esi\n\t"
+ "movl 8(%ecx), %edi\n\t"
+ "movl 12(%ecx), %ebp\n\t"
+ "movl 16(%ecx), %esp\n\t"
+ "jmp *%edx\n");
+#endif /* CONFIG_FRAME_POINTER */
+}
+
+#else /* CONFIG_X86_32 */
+
+int asmlinkage
+kdba_setjmp(kdb_jmp_buf *jb)
+{
+#ifdef CONFIG_FRAME_POINTER
+ __asm__ __volatile__
+ ("movq %%rbx, (0*8)(%%rdi);"
+ "movq %%rcx, (1*8)(%%rdi);"
+ "movq %%r12, (2*8)(%%rdi);"
+ "movq %%r13, (3*8)(%%rdi);"
+ "movq %%r14, (4*8)(%%rdi);"
+ "movq %%r15, (5*8)(%%rdi);"
+ "leaq 16(%%rsp), %%rdx;"
+ "movq %%rdx, (6*8)(%%rdi);"
+ "movq %%rax, (7*8)(%%rdi)"
+ :
+ : "a" (__builtin_return_address(0)),
+ "c" (__builtin_frame_address(1))
+ );
+#else /* !CONFIG_FRAME_POINTER */
+ __asm__ __volatile__
+ ("movq %%rbx, (0*8)(%%rdi);"
+ "movq %%rbp, (1*8)(%%rdi);"
+ "movq %%r12, (2*8)(%%rdi);"
+ "movq %%r13, (3*8)(%%rdi);"
+ "movq %%r14, (4*8)(%%rdi);"
+ "movq %%r15, (5*8)(%%rdi);"
+ "leaq 8(%%rsp), %%rdx;"
+ "movq %%rdx, (6*8)(%%rdi);"
+ "movq %%rax, (7*8)(%%rdi)"
+ :
+ : "a" (__builtin_return_address(0))
+ );
+#endif /* CONFIG_FRAME_POINTER */
+ return 0;
+}
+
+void asmlinkage
+kdba_longjmp(kdb_jmp_buf *jb, int reason)
+{
+ __asm__("movq (0*8)(%rdi),%rbx;"
+ "movq (1*8)(%rdi),%rbp;"
+ "movq (2*8)(%rdi),%r12;"
+ "movq (3*8)(%rdi),%r13;"
+ "movq (4*8)(%rdi),%r14;"
+ "movq (5*8)(%rdi),%r15;"
+ "movq (7*8)(%rdi),%rdx;"
+ "movq (6*8)(%rdi),%rsp;"
+ "mov %rsi, %rax;"
+ "jmpq *%rdx");
+}
+#endif /* CONFIG_X86_32 */
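/*
 * A minimal sketch of the intended kdba_setjmp()/kdba_longjmp() pairing;
 * the buffer name and the fault-recovery hook are assumptions, only the
 * setjmp/longjmp semantics are taken from the code above:
 *
 *	static kdb_jmp_buf recover;
 *
 *	if (kdba_setjmp(&recover) == 0) {
 *		... attempt an access that may fault ...
 *	} else {
 *		... a fault handler called kdba_longjmp(&recover, 1) ...
 *		kdb_printf("bad address\n");
 *	}
 */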
+
+#ifdef CONFIG_X86_32
+/*
+ * kdba_stackdepth
+ *
+ * Print processes that are using more than a specific percentage of their
+ * stack.
+ *
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ * Remarks:
+ * If no percentage is supplied, it uses 60.
+ */
+
+static void
+kdba_stackdepth1(struct task_struct *p, unsigned long sp)
+{
+ struct thread_info *tinfo;
+ int used;
+ const char *type;
+ kdb_ps1(p);
+ do {
+ tinfo = (struct thread_info *)(sp & -THREAD_SIZE);
+ used = sizeof(*tinfo) + THREAD_SIZE - (sp & (THREAD_SIZE-1));
+ type = NULL;
+ if (kdb_task_has_cpu(p)) {
+ struct kdb_activation_record ar;
+ memset(&ar, 0, sizeof(ar));
+ kdba_get_stack_info_alternate(sp, -1, &ar);
+ type = ar.stack.id;
+ }
+ if (!type)
+ type = "process";
+ kdb_printf(" %s stack %p sp %lx used %d\n", type, tinfo, sp, used);
+ sp = tinfo->previous_esp;
+ } while (sp);
+}
+
+static int
+kdba_stackdepth(int argc, const char **argv)
+{
+ int diag, cpu, threshold, used, over;
+ unsigned long percentage;
+ unsigned long esp;
+ long offset = 0;
+ int nextarg;
+ struct task_struct *p, *g;
+ struct kdb_running_process *krp;
+ struct thread_info *tinfo;
+
+ if (argc == 0) {
+ percentage = 60;
+ } else if (argc == 1) {
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &percentage, &offset, NULL);
+ if (diag)
+ return diag;
+ } else {
+ return KDB_ARGCOUNT;
+ }
+ percentage = max_t(int, percentage, 1);
+ percentage = min_t(int, percentage, 100);
+ threshold = ((2 * THREAD_SIZE * percentage) / 100 + 1) >> 1;
+ kdb_printf("stackdepth: processes using more than %ld%% (%d bytes) of stack\n",
+ percentage, threshold);
+
+ /* Run the active tasks first, they can have multiple stacks */
+ for (cpu = 0, krp = kdb_running_process; cpu < NR_CPUS; ++cpu, ++krp) {
+ if (!cpu_online(cpu))
+ continue;
+ p = krp->p;
+ esp = krp->arch.sp;
+ over = 0;
+ do {
+ tinfo = (struct thread_info *)(esp & -THREAD_SIZE);
+ used = sizeof(*tinfo) + THREAD_SIZE - (esp & (THREAD_SIZE-1));
+ if (used >= threshold)
+ over = 1;
+ esp = tinfo->previous_esp;
+ } while (esp);
+ if (over)
+ kdba_stackdepth1(p, krp->arch.sp);
+ }
+ /* Now the tasks that are not on cpus */
+ kdb_do_each_thread(g, p) {
+ if (kdb_task_has_cpu(p))
+ continue;
+ esp = p->thread.sp;
+ used = sizeof(*tinfo) + THREAD_SIZE - (esp & (THREAD_SIZE-1));
+ over = used >= threshold;
+ if (over)
+ kdba_stackdepth1(p, esp);
+ } kdb_while_each_thread(g, p);
+
+ return 0;
+}
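/*
 * A worked example of the threshold rounding above, assuming the common
 * THREAD_SIZE of 8192 bytes and the default 60 percent:
 *
 *	threshold = ((2 * 8192 * 60) / 100 + 1) >> 1
 *	          = (9830 + 1) >> 1
 *	          = 4915 bytes
 *
 * i.e. the percentage of the stack rounded to the nearest byte rather than
 * truncated.
 */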
+#else /* CONFIG_X86_32 */
+
+
+/*
+ * kdba_entry
+ *
+ * This is the interface routine between
+ * the notifier die_chain and kdb
+ */
+static int kdba_entry( struct notifier_block *b, unsigned long val, void *v)
+{
+ struct die_args *args = v;
+ int err, trap, ret = 0;
+ struct pt_regs *regs;
+
+ regs = args->regs;
+ err = args->err;
+ trap = args->trapnr;
+ switch (val){
+#ifdef CONFIG_SMP
+ case DIE_NMI_IPI:
+ ret = kdb_ipi(regs, NULL);
+ break;
+#endif /* CONFIG_SMP */
+ case DIE_OOPS:
+ ret = kdb(KDB_REASON_OOPS, err, regs);
+ break;
+ case DIE_CALL:
+ ret = kdb(KDB_REASON_ENTER, err, regs);
+ break;
+ case DIE_DEBUG:
+ ret = kdb(KDB_REASON_DEBUG, err, regs);
+ break;
+ case DIE_NMIWATCHDOG:
+ ret = kdb(KDB_REASON_NMI, err, regs);
+ break;
+ case DIE_INT3:
+ ret = kdb(KDB_REASON_BREAK, err, regs);
+ /* fall through */
+ default:
+ break;
+ }
+ return (ret ? NOTIFY_STOP : NOTIFY_DONE);
+}
+
+/*
+ * notifier block for kdb entry
+ */
+static struct notifier_block kdba_notifier = {
+ .notifier_call = kdba_entry
+};
+#endif /* CONFIG_X86_32 */
+
+asmlinkage int kdb_call(void);
+
+/* Executed once on each cpu at startup. */
+void
+kdba_cpu_up(void)
+{
+}
+
+static int __init
+kdba_arch_init(void)
+{
+ set_intr_gate(KDBENTER_VECTOR, kdb_call);
+ return 0;
+}
+
+arch_initcall(kdba_arch_init);
+
+/*
+ * kdba_init
+ *
+ * Architecture specific initialization.
+ *
+ * Parameters:
+ * None.
+ * Returns:
+ * None.
+ * Locking:
+ * None.
+ * Remarks:
+ * None.
+ */
+
+void __init
+kdba_init(void)
+{
+ kdba_arch_init(); /* Need to register KDBENTER_VECTOR early */
+ kdb_register("pt_regs", kdba_pt_regs, "address", "Format struct pt_regs", 0);
+#ifdef CONFIG_X86_32
+ kdb_register("stackdepth", kdba_stackdepth, "[percentage]", "Print processes using >= stack percentage", 0);
+#else
+ register_die_notifier(&kdba_notifier);
+#endif
+ return;
+}
+
+/*
+ * kdba_adjust_ip
+ *
+ * Architecture specific adjustment of instruction pointer before leaving
+ * kdb.
+ *
+ * Parameters:
+ * reason The reason KDB was invoked
+ * error The hardware-defined error code
+ * regs The exception frame at time of fault/breakpoint. If reason
+ * is SILENT or CPU_UP then regs is NULL, otherwise it should
+ * always be valid.
+ * Returns:
+ * None.
+ * Locking:
+ * None.
+ * Remarks:
+ * noop on ix86.
+ */
+
+void
+kdba_adjust_ip(kdb_reason_t reason, int error, struct pt_regs *regs)
+{
+ return;
+}
+
+void
+kdba_set_current_task(const struct task_struct *p)
+{
+ kdb_current_task = p;
+ if (kdb_task_has_cpu(p)) {
+ struct kdb_running_process *krp = kdb_running_process + kdb_process_cpu(p);
+ kdb_current_regs = krp->regs;
+ return;
+ }
+ kdb_current_regs = NULL;
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * asm-i386 uaccess.h supplies __copy_to_user which relies on the MMU to
+ * trap invalid addresses in the _xxx fields. Verify the other address
+ * of the pair is valid by accessing the first and last byte ourselves,
+ * then any access violations should only be caused by the _xxx
+ * addresses.
+ */
+
+int
+kdba_putarea_size(unsigned long to_xxx, void *from, size_t size)
+{
+ mm_segment_t oldfs = get_fs();
+ int r;
+ char c;
+ c = *((volatile char *)from);
+ c = *((volatile char *)from + size - 1);
+
+ if (to_xxx < PAGE_OFFSET) {
+ return kdb_putuserarea_size(to_xxx, from, size);
+ }
+
+ set_fs(KERNEL_DS);
+ r = __copy_to_user_inatomic((void __user *)to_xxx, from, size);
+ set_fs(oldfs);
+ return r;
+}
+
+int
+kdba_getarea_size(void *to, unsigned long from_xxx, size_t size)
+{
+ mm_segment_t oldfs = get_fs();
+ int r;
+ *((volatile char *)to) = '\0';
+ *((volatile char *)to + size - 1) = '\0';
+
+ if (from_xxx < PAGE_OFFSET) {
+ return kdb_getuserarea_size(to, from_xxx, size);
+ }
+
+ set_fs(KERNEL_DS);
+ switch (size) {
+ case 1:
+ r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, 1);
+ break;
+ case 2:
+ r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, 2);
+ break;
+ case 4:
+ r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, 4);
+ break;
+ case 8:
+ r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, 8);
+ break;
+ default:
+ r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, size);
+ break;
+ }
+ set_fs(oldfs);
+ return r;
+}
+
+int
+kdba_verify_rw(unsigned long addr, size_t size)
+{
+ unsigned char data[size];
+ return(kdba_getarea_size(data, addr, size) || kdba_putarea_size(addr, data, size));
+}
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_SMP
+
+#include <asm/ipi.h>
+
+gate_desc save_idt[NR_VECTORS];
+
+void kdba_takeover_vector(int vector)
+{
+ memcpy(&save_idt[vector], &idt_table[vector], sizeof(gate_desc));
+ set_intr_gate(KDB_VECTOR, kdb_interrupt);
+ return;
+}
+
+void kdba_giveback_vector(int vector)
+{
+ native_write_idt_entry(idt_table, vector, &save_idt[vector]);
+ return;
+}
+
+/* When first entering KDB, try a normal IPI. That reduces backtrace problems
+ * on the other cpus.
+ */
+void
+smp_kdb_stop(void)
+{
+ if (!KDB_FLAG(NOIPI)) {
+ kdba_takeover_vector(KDB_VECTOR);
+ apic->send_IPI_allbutself(KDB_VECTOR);
+ }
+}
+
+/* The normal KDB IPI handler */
+#ifdef CONFIG_X86_64
+asmlinkage
+#endif
+void
+smp_kdb_interrupt(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ ack_APIC_irq();
+ irq_enter();
+ kdb_ipi(regs, NULL);
+ irq_exit();
+ set_irq_regs(old_regs);
+}
+
+/* Invoked once from kdb_wait_for_cpus when waiting for cpus. For those cpus
+ * that have not responded to the normal KDB interrupt yet, hit them with an
+ * NMI event.
+ */
+void
+kdba_wait_for_cpus(void)
+{
+ int c;
+ if (KDB_FLAG(CATASTROPHIC))
+ return;
+ kdb_printf(" Sending NMI to non-responding cpus: ");
+ for_each_online_cpu(c) {
+ if (kdb_running_process[c].seqno < kdb_seqno - 1) {
+ kdb_printf(" %d", c);
+ apic->send_IPI_mask(cpumask_of(c), NMI_VECTOR);
+ }
+ }
+ kdb_printf(".\n");
+}
+
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_KDB_KDUMP
+void kdba_kdump_prepare(struct pt_regs *regs)
+{
+ int i;
+ struct pt_regs r;
+ if (regs == NULL)
+ regs = &r;
+
+ for (i = 1; i < NR_CPUS; ++i) {
+ if (!cpu_online(i))
+ continue;
+
+ KDB_STATE_SET_CPU(KEXEC, i);
+ }
+
+ machine_crash_shutdown(regs);
+}
+
+extern void halt_current_cpu(struct pt_regs *);
+
+void kdba_kdump_shutdown_slave(struct pt_regs *regs)
+{
- #ifndef CONFIG_PARAVIRT_XEN
++#ifndef CONFIG_XEN
+ halt_current_cpu(regs);
+#endif /* CONFIG_XEN */
+}
+
+#endif /* CONFIG_KDB_KDUMP */
}
return 0;
}
- #endif
+static int __init force_acpi_rsdt(const struct dmi_system_id *d)
+{
+ if (!acpi_force) {
+ printk(KERN_NOTICE "%s detected: force use of acpi=rsdt\n",
+ d->ident);
+ acpi_rsdt_forced = 1;
+ } else {
+ printk(KERN_NOTICE
+ "Warning: acpi=force overrules DMI blacklist: "
+ "acpi=rsdt\n");
+ }
+ return 0;
+
+}
+
/*
* If your system is blacklisted here, but you find that acpi=force
* works for you, please contact linux-acpi@vger.kernel.org
obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
+obj-$(CONFIG_X86_MCE_XEON75XX) += mce-xeon75xx.o
obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
- obj-$(CONFIG_X86_XEN_MCE) += mce_dom0.o
obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
CFI_ENDPROC
END(call_softirq)
+#ifdef CONFIG_STACK_UNWIND
+ENTRY(arch_unwind_init_running)
+ CFI_STARTPROC
+ movq %r15, R15(%rdi)
+ movq %r14, R14(%rdi)
+ xchgq %rsi, %rdx
+ movq %r13, R13(%rdi)
+ movq %r12, R12(%rdi)
+ xorl %eax, %eax
+ movq %rbp, RBP(%rdi)
+ movq %rbx, RBX(%rdi)
+ movq (%rsp), %r9
+ xchgq %rdx, %rcx
+ movq %rax, R11(%rdi)
+ movq %rax, R10(%rdi)
+ movq %rax, R9(%rdi)
+ movq %rax, R8(%rdi)
+ movq %rax, RAX(%rdi)
+ movq %rax, RCX(%rdi)
+ movq %rax, RDX(%rdi)
+ movq %rax, RSI(%rdi)
+ movq %rax, RDI(%rdi)
+ movq %rax, ORIG_RAX(%rdi)
+ movq %r9, RIP(%rdi)
+ leaq 8(%rsp), %r9
+ movq $__KERNEL_CS, CS(%rdi)
+ movq %rax, EFLAGS(%rdi)
+ movq %r9, RSP(%rdi)
+ movq $__KERNEL_DS, SS(%rdi)
+ jmpq *%rcx
+ CFI_ENDPROC
+END(arch_unwind_init_running)
+#endif
+
- #ifdef CONFIG_PARAVIRT_XEN
+ #ifdef CONFIG_XEN
zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
/*
CFI_ENDPROC
END(xen_failsafe_callback)
- #endif /* CONFIG_PARAVIRT_XEN */
+ #endif /* CONFIG_XEN */
+#ifdef CONFIG_KDB
+
+#ifdef CONFIG_SMP
+apicinterrupt KDB_VECTOR \
+ kdb_interrupt, smp_kdb_interrupt
+#endif /* CONFIG_SMP */
+
+ENTRY(kdb_call)
+ INTR_FRAME
+ cld
+ pushq $-1 # orig_eax
+ CFI_ADJUST_CFA_OFFSET 8
+ SAVE_ALL
+ movq $1,%rdi # KDB_REASON_ENTER
+ movq $0,%rsi # error_code
+ movq %rsp,%rdx # struct pt_regs
+ call kdb
+ RESTORE_ALL
+ addq $8,%rsp # forget orig_eax
+ CFI_ADJUST_CFA_OFFSET -8
+ iretq
+ CFI_ENDPROC
+END(kdb_call)
+
+#endif /* CONFIG_KDB */
+
+
/*
* Some functions should be protected against kprobes
*/
#include <asm/cacheflush.h>
#include <asm/debugreg.h>
- #ifdef CONFIG_XEN
- #include <xen/interface/kexec.h>
- #endif
-
-static void set_idt(void *newidt, __u16 limit)
-{
- struct desc_ptr curidt;
-
- /* ia32 supports unaliged loads & stores */
- curidt.size = limit;
- curidt.address = (unsigned long)newidt;
-
- load_idt(&curidt);
-}
-
-
-static void set_gdt(void *newgdt, __u16 limit)
-{
- struct desc_ptr curgdt;
-
- /* ia32 supports unaligned loads & stores */
- curgdt.size = limit;
- curgdt.address = (unsigned long)newgdt;
-
- load_gdt(&curgdt);
-}
-
-static void load_segments(void)
-{
-#define __STR(X) #X
-#define STR(X) __STR(X)
-
- __asm__ __volatile__ (
- "\tljmp $"STR(__KERNEL_CS)",$1f\n"
- "\t1:\n"
- "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
- "\tmovl %%eax,%%ds\n"
- "\tmovl %%eax,%%es\n"
- "\tmovl %%eax,%%fs\n"
- "\tmovl %%eax,%%gs\n"
- "\tmovl %%eax,%%ss\n"
- : : : "eax", "memory");
-#undef STR
-#undef __STR
-}
-
static void machine_kexec_free_page_tables(struct kimage *image)
{
free_page((unsigned long)image->arch.pgd);
EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(init_level4_pgt);
- #if !defined(CONFIG_PARAVIRT_CPU) && !defined(CONFIG_XEN)
-#ifndef CONFIG_PARAVIRT
++#ifndef CONFIG_PARAVIRT_CPU
EXPORT_SYMBOL(native_load_gs_index);
#endif
# This Kconfig describes xen options
#
- config PARAVIRT_XEN
+ config XEN
bool "Xen guest support"
- select PARAVIRT
+ select PARAVIRT_ALL
select PARAVIRT_CLOCK
depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS)
depends on X86_CMPXCHG && X86_TSC
obj-y += base/ block/ misc/ mfd/
obj-$(CONFIG_NUBUS) += nubus/
obj-y += macintosh/
- obj-$(CONFIG_XEN) += xen/
-obj-$(CONFIG_IDE) += ide/
obj-$(CONFIG_SCSI) += scsi/
obj-$(CONFIG_ATA) += ata/
obj-$(CONFIG_MTD) += mtd/
obj-$(CONFIG_BLK_DEV_UB) += ub.o
obj-$(CONFIG_BLK_DEV_HD) += hd.o
- obj-$(CONFIG_XEN_BLKFRONT) += xen-blkfront.o
+ obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
+obj-$(CONFIG_CIPHER_TWOFISH) += loop_fish2.o
+
swim_mod-objs := swim.o swim_asm.o
{}
};
- static struct of_platform_driver of_fsl_dma_driver = {
- .owner = THIS_MODULE,
- .name = "fsl-elo-dma",
- .match_table = of_fsl_dma_ids,
- .probe = of_fsl_dma_probe,
- .remove = of_fsl_dma_remove,
+ static struct of_platform_driver fsldma_of_driver = {
++ .owner = THIS_MODULE,
+ .name = "fsl-elo-dma",
+ .match_table = fsldma_of_ids,
+ .probe = fsldma_of_probe,
+ .remove = fsldma_of_remove,
};
- static __init int of_fsl_dma_init(void)
+ /*----------------------------------------------------------------------------*/
+ /* Module Init / Exit */
+ /*----------------------------------------------------------------------------*/
+
+ static __init int fsldma_init(void)
{
int ret;
if (!p)
return ERR_PTR(-ENOMEM);
- r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table),
+ path = shift(as);
- r = dm_get_device(ti, path, ti->begin, ti->len,
- dm_table_get_mode(ti->table), &p->path.dev);
++ r = dm_get_device(ti, path, dm_table_get_mode(ti->table),
+ &p->path.dev);
if (r) {
- ti->error = "error getting device";
- goto bad;
+ unsigned major, minor;
+
+ /* Try to add a failed device */
+ if (r == -ENXIO && sscanf(path, "%u:%u", &major, &minor) == 2) {
+ dev_t dev;
+
+ /* Extract the major/minor numbers */
+ dev = MKDEV(major, minor);
+ if (MAJOR(dev) != major || MINOR(dev) != minor) {
+ /* Nice try, didn't work */
+ DMWARN("Invalid device path %s", path);
+ ti->error = "error converting devnum";
+ goto bad;
+ }
+ DMWARN("adding disabled device %d:%d", major, minor);
+ p->path.dev = NULL;
+ format_dev_t(p->path.pdev, dev);
+ p->is_active = 0;
+ } else {
+ ti->error = "error getting device";
+ goto bad;
+ }
+ } else {
+ memcpy(p->path.pdev, p->path.dev->name, 16);
}
- if (m->hw_handler_name) {
+ if (p->path.dev) {
struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
- r = scsi_dh_attach(q, m->hw_handler_name);
- if (r == -EBUSY) {
- /*
- * Already attached to different hw_handler,
- * try to reattach with correct one.
- */
- scsi_dh_detach(q);
+ if (m->hw_handler_name) {
r = scsi_dh_attach(q, m->hw_handler_name);
- }
-
- if (r < 0) {
- ti->error = "error attaching hardware handler";
- dm_put_device(ti, p->path.dev);
- goto bad;
+ if (r == -EBUSY) {
+ /*
+ * Already attached to different hw_handler,
+ * try to reattach with correct one.
+ */
+ scsi_dh_detach(q);
+ r = scsi_dh_attach(q, m->hw_handler_name);
+ }
+ if (r < 0) {
+ ti->error = "error attaching hardware handler";
+ dm_put_device(ti, p->path.dev);
+ goto bad;
+ }
+ } else {
+ /* Play safe and detach hardware handler */
+ scsi_dh_detach(q);
}
if (m->hw_handler_params) {
errors = 0;
break;
}
- DMERR("Cannot failover device %s because scsi_dh_%s was not "
- "loaded.", pgpath->path.pdev, m->hw_handler_name);
- DMERR("Could not failover the device: Handler scsi_dh_%s "
- "Error %d.", m->hw_handler_name, errors);
++ DMERR("Count not failover device %s: Handler scsi_dh_%s "
++ "was not loaded.", pgpath->path.pdev,
++ m->hw_handler_name);
/*
* Fail path for now, so we do not ping pong
*/
struct pgpath *pgpath =
container_of(work, struct pgpath, activate_path);
- scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
- pg_init_done, pgpath);
+ if (pgpath->path.dev)
+ scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
- pg_init_done, &pgpath->path);
++ pg_init_done, pgpath);
+}
+
+/*
+ * Evaluate scsi return code
+ */
+static int eval_scsi_error(int result, char *sense, int sense_len)
+{
+ struct scsi_sense_hdr sshdr;
+ int r = DM_ENDIO_REQUEUE;
+
+ if (host_byte(result) != DID_OK)
+ return r;
+
+ if (msg_byte(result) != COMMAND_COMPLETE)
+ return r;
+
+ if (status_byte(result) == RESERVATION_CONFLICT)
+ /* Do not retry here, possible data corruption */
+ return -EIO;
+
+#if defined(CONFIG_SCSI) || defined(CONFIG_SCSI_MODULE)
+ if (status_byte(result) == CHECK_CONDITION &&
+ !scsi_normalize_sense(sense, sense_len, &sshdr)) {
+
+ switch (sshdr.sense_key) {
+ case MEDIUM_ERROR:
+ case DATA_PROTECT:
+ case BLANK_CHECK:
+ case COPY_ABORTED:
+ case VOLUME_OVERFLOW:
+ case MISCOMPARE:
+ r = -EIO;
+ break;
+ }
+ }
+#endif
+
+ return r;
}
/*
--- /dev/null
+/*
+ * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com>
+ *
+ * This file is released under the GPL.
+ *
+ *
+ * Linux 2.6 Device Mapper RAID4 and RAID5 target.
+ *
+ * Supports:
+ * o RAID4 with dedicated and selectable parity device
+ * o RAID5 with rotating parity (left+right, symmetric+asymmetric)
+ * o run time optimization of xor algorithm used to calculate parity
+ *
+ *
+ * Thanks to MD for:
+ * o the raid address calculation algorithm
+ * o the base of the biovec <-> page list copier.
+ *
+ *
+ * Uses region hash to keep track of how many writes are in flight to
+ * regions in order to use dirty log to keep state of regions to recover:
+ *
+ * o clean regions (those which are synchronized
+ * and don't have write io in flight)
+ * o dirty regions (those with write io in flight)
+ *
+ *
+ * On startup, any dirty regions are migrated to the 'nosync' state
+ * and are subject to recovery by the daemon.
+ *
+ * See raid_ctr() for table definition.
+ *
+ *
+ * FIXME:
+ * o add virtual interface for locking
+ * o remove instrumentation (REMOVEME:)
+ *
+ */
+
+static const char *version = "v0.2431";
+
+#include "dm.h"
+#include "dm-memcache.h"
+#include "dm-message.h"
+#include "dm-raid45.h"
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+
+#include <linux/dm-io.h>
+#include <linux/dm-dirty-log.h>
+#include <linux/dm-region-hash.h>
+
+/* # of parallel recovered regions */
+/* FIXME: cope with multiple recovery stripes in raid_set struct. */
+#define MAX_RECOVER 1 /* needs to be 1! */
+
+/*
+ * Configurable parameters
+ */
+#define INLINE
+
+/* Default # of stripes if not set in constructor. */
+#define STRIPES 64
+
+/* Minimum/maximum # of selectable stripes. */
+#define STRIPES_MIN 8
+#define STRIPES_MAX 16384
+
+/* Default chunk size in sectors if not set in constructor. */
+#define CHUNK_SIZE 64
+
+/* Default io size in sectors if not set in constructor. */
+#define IO_SIZE_MIN SECTORS_PER_PAGE
+#define IO_SIZE IO_SIZE_MIN
+
+/* Maximum settable chunk size in sectors. */
+#define CHUNK_SIZE_MAX 16384
+
+/* Recover io size default in sectors. */
+#define RECOVER_IO_SIZE_MIN 64
+#define RECOVER_IO_SIZE 256
+
+/* Default percentage recover io bandwidth. */
+#define BANDWIDTH 10
+#define BANDWIDTH_MIN 1
+#define BANDWIDTH_MAX 100
+/*
+ * END Configurable parameters
+ */
+
+#define TARGET "dm-raid45"
+#define DAEMON "kraid45d"
+#define DM_MSG_PREFIX TARGET
+
+#define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)
+
+/* Amount/size for __xor(). */
+#define SECTORS_PER_XOR SECTORS_PER_PAGE
+#define XOR_SIZE PAGE_SIZE
+
+/* Derive raid_set from stripe_cache pointer. */
+#define RS(x) container_of(x, struct raid_set, sc)
+
+/* Check value in range. */
+#define range_ok(i, min, max) ((i) >= (min) && (i) <= (max))
+
+/* Page reference. */
+#define PAGE(stripe, p) ((stripe)->obj[p].pl->page)
+
+/* Bio list reference. */
+#define BL(stripe, p, rw) (stripe->ss[p].bl + rw)
+
+/* Page list reference. */
+#define PL(stripe, p) (stripe->obj[p].pl)
+
+/* Check argument is power of 2. */
+#define POWER_OF_2(a) (!((a) & ((a) - 1)))
+
+/* Factor out to dm-bio-list.h */
+static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
+{
+ bio->bi_next = bl->head;
+ bl->head = bio;
+
+ if (!bl->tail)
+ bl->tail = bio;
+}
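/*
 * A minimal usage sketch, assuming a bio that must be retried ahead of
 * anything already queued; unlike bio_list_add(), which appends to the
 * tail, bio_list_push() prepends to the head:
 *
 *	bio_list_push(&rs->io.in, bio);
 */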
+
+/* Factor out to dm.h */
+#define TI_ERR_RET(str, ret) \
+ do { ti->error = DM_MSG_PREFIX ": " str; return ret; } while (0);
+#define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
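/*
 * A minimal usage sketch for the constructor error macros, assuming a
 * target constructor with "ti" in scope; the checks themselves are made up:
 *
 *	if (!range_ok(chunk_size, 1, CHUNK_SIZE_MAX))
 *		TI_ERR("Invalid chunk size");
 *	if (!(rs = kzalloc(sizeof(*rs), GFP_KERNEL)))
 *		TI_ERR_RET("Cannot allocate raid set", -ENOMEM);
 */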
+
+/*-----------------------------------------------------------------
+ * Stripe cache
+ *
+ * Cache for all reads and writes to raid sets (operational or degraded)
+ *
+ * We need to run all data to and from a RAID set through this cache,
+ * because parity chunks need to get calculated from data chunks
+ * or, in the degraded/resynchronization case, missing chunks need
+ * to be reconstructed using the other chunks of the stripe.
+ *---------------------------------------------------------------*/
+/* Protect kmem cache # counter. */
+static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */
+
+/* A stripe set (holds bios hanging off). */
+struct stripe_set {
+ struct stripe *stripe; /* Backpointer to stripe for endio(). */
+ struct bio_list bl[3]; /* Reads, writes, and writes merged. */
+#define WRITE_MERGED 2
+};
+
+#if READ != 0 || WRITE != 1
+#error dm-raid45: READ/WRITE != 0/1 used as index!!!
+#endif
+
+/*
+ * Stripe linked list indexes. Keep order, because the stripe
+ * and the stripe cache rely on the first 3!
+ */
+enum list_types {
+ LIST_IO = 0, /* Stripes with io pending. */
+ LIST_ENDIO, /* Stripes to endio. */
+ LIST_LRU, /* Least recently used stripes. */
+ LIST_HASH, /* Hashed stripes. */
+ LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
+ NR_LISTS, /* To size array in struct stripe. */
+};
+
+enum lock_types {
+ LOCK_ENDIO = 0, /* Protect endio list. */
+ LOCK_LRU, /* Protect lru list. */
+ NR_LOCKS, /* To size array in struct stripe_cache. */
+};
+
+/* A stripe: the io object to handle all reads and writes to a RAID set. */
+struct stripe {
+ struct stripe_cache *sc; /* Backpointer to stripe cache. */
+
+ sector_t key; /* Hash key. */
+ region_t region; /* Region stripe is mapped to. */
+
+ /* Reference count. */
+ atomic_t cnt;
+
+ struct {
+ unsigned long flags; /* flags (see below). */
+
+ /*
+ * Pending ios in flight:
+ *
+ * used as a 'lock' to control move of stripe to endio list
+ */
+ atomic_t pending; /* Pending ios in flight. */
+
+ /* Sectors to read and write for multi page stripe sets. */
+ unsigned size;
+ } io;
+
+ /* Lock on stripe (for clustering). */
+ void *lock;
+
+ /*
+ * 4 linked lists:
+ * o io list to flush io
+ * o endio list
+ * o LRU list to put stripes w/o reference count on
+ * o stripe cache hash
+ */
+ struct list_head lists[NR_LISTS];
+
+ struct {
+ unsigned short parity; /* Parity chunk index. */
+ short recover; /* Recovery chunk index. */
+ } idx;
+
+ /* This sets memory cache object (dm-mem-cache). */
+ struct dm_mem_cache_object *obj;
+
+ /* Array of stripe sets (dynamically allocated). */
+ struct stripe_set ss[0];
+};
+
+/* States stripes can be in (flags field). */
+enum stripe_states {
+ STRIPE_ACTIVE, /* Active io on stripe. */
+ STRIPE_ERROR, /* io error on stripe. */
+ STRIPE_MERGED, /* Writes got merged. */
+ STRIPE_READ, /* Read. */
+ STRIPE_RBW, /* Read-before-write. */
+ STRIPE_RECONSTRUCT, /* reconstruct of a missing chunk required. */
+ STRIPE_RECOVER, /* Stripe used for RAID set recovery. */
+};
+
+/* ... and macros to access them. */
+#define BITOPS(name, what, var, flag) \
+static inline int TestClear ## name ## what(struct var *v) \
+{ return test_and_clear_bit(flag, &v->io.flags); } \
+static inline int TestSet ## name ## what(struct var *v) \
+{ return test_and_set_bit(flag, &v->io.flags); } \
+static inline void Clear ## name ## what(struct var *v) \
+{ clear_bit(flag, &v->io.flags); } \
+static inline void Set ## name ## what(struct var *v) \
+{ set_bit(flag, &v->io.flags); } \
+static inline int name ## what(struct var *v) \
+{ return test_bit(flag, &v->io.flags); }
+
+
+BITOPS(Stripe, Active, stripe, STRIPE_ACTIVE)
+BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
+BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
+BITOPS(Stripe, Read, stripe, STRIPE_READ)
+BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
+BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
+BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)
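/*
 * For reference, the instantiation BITOPS(Stripe, Active, stripe,
 * STRIPE_ACTIVE) above generates:
 *
 *	int  TestClearStripeActive(struct stripe *v);
 *	int  TestSetStripeActive(struct stripe *v);
 *	void ClearStripeActive(struct stripe *v);
 *	void SetStripeActive(struct stripe *v);
 *	int  StripeActive(struct stripe *v);
 *
 * all operating atomically on bit STRIPE_ACTIVE of v->io.flags, so a
 * caller can claim a stripe with e.g. if (!TestSetStripeActive(stripe)).
 */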
+
+/* A stripe hash. */
+struct stripe_hash {
+ struct list_head *hash;
+ unsigned buckets;
+ unsigned mask;
+ unsigned prime;
+ unsigned shift;
+};
+
+/* A stripe cache. */
+struct stripe_cache {
+ /* Stripe hash. */
+ struct stripe_hash hash;
+
+ /* Stripes with io to flush, stripes to endio and LRU lists. */
+ struct list_head lists[3];
+
+ /* Locks to protect endio and lru lists. */
+ spinlock_t locks[NR_LOCKS];
+
+ /* Slab cache to allocate stripes from. */
+ struct {
+ struct kmem_cache *cache; /* Cache itself. */
+ char name[32]; /* Unique name. */
+ } kc;
+
+ struct dm_io_client *dm_io_client; /* dm-io client resource context. */
+
+ /* dm-mem-cache client resource context. */
+ struct dm_mem_cache_client *mem_cache_client;
+
+ int stripes_parm; /* # stripes parameter from constructor. */
+ atomic_t stripes; /* actual # of stripes in cache. */
+ atomic_t stripes_to_shrink; /* # of stripes to shrink cache by. */
+ atomic_t stripes_last; /* last # of stripes in cache. */
+ atomic_t active_stripes; /* actual # of active stripes in cache. */
+
+ /* REMOVEME: */
+	atomic_t max_active_stripes; /* maximum # of active stripes observed. */
+};
+
+/* Flag specs for raid_dev. */
+enum raid_dev_flags { DEVICE_FAILED, IO_QUEUED };
+
+/* The raid device in a set. */
+struct raid_dev {
+ struct dm_dev *dev;
+ unsigned long flags; /* raid_dev_flags. */
+ sector_t start; /* offset to map to. */
+};
+
+/* Flags spec for raid_set. */
+enum raid_set_flags {
+ RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */
+	RS_DEAD,		/* RAID set inoperative. */
+ RS_DEVEL_STATS, /* REMOVEME: display status information. */
+ RS_IO_ERROR, /* io error on set. */
+ RS_RECOVER, /* Do recovery. */
+ RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */
+ RS_REGION_GET, /* get a region to recover. */
+ RS_SC_BUSY, /* stripe cache busy -> send an event. */
+	RS_SUSPENDED,		/* RAID set suspended. */
+};
+
+/* REMOVEME: devel stats counters. */
+enum stats_types {
+ S_BIOS_READ,
+ S_BIOS_ADDED_READ,
+ S_BIOS_ENDIO_READ,
+ S_BIOS_WRITE,
+ S_BIOS_ADDED_WRITE,
+ S_BIOS_ENDIO_WRITE,
+ S_CAN_MERGE,
+ S_CANT_MERGE,
+ S_CONGESTED,
+ S_DM_IO_READ,
+ S_DM_IO_WRITE,
+ S_ACTIVE_READS,
+ S_BANDWIDTH,
+ S_BARRIER,
+ S_BIO_COPY_PL_NEXT,
+ S_DEGRADED,
+ S_DELAYED_BIOS,
+ S_EVICT,
+ S_FLUSHS,
+ S_HITS_1ST,
+ S_IOS_POST,
+ S_INSCACHE,
+ S_MAX_LOOKUP,
+ S_MERGE_PAGE_LOCKED,
+ S_NO_BANDWIDTH,
+ S_NOT_CONGESTED,
+ S_NO_RW,
+ S_NOSYNC,
+ S_PROHIBITPAGEIO,
+ S_RECONSTRUCT_EI,
+ S_RECONSTRUCT_DEV,
+ S_REDO,
+ S_REQUEUE,
+ S_STRIPE_ERROR,
+ S_SUM_DELAYED_BIOS,
+ S_XORS,
+ S_NR_STATS, /* # of stats counters. */
+};
+
+/* Status type -> string mappings. */
+struct stats_map {
+ const enum stats_types type;
+ const char *str;
+};
+
+static struct stats_map stats_map[] = {
+ { S_BIOS_READ, "r=" },
+ { S_BIOS_ADDED_READ, "/" },
+ { S_BIOS_ENDIO_READ, "/" },
+ { S_BIOS_WRITE, " w=" },
+ { S_BIOS_ADDED_WRITE, "/" },
+ { S_BIOS_ENDIO_WRITE, "/" },
+ { S_DM_IO_READ, " rc=" },
+ { S_DM_IO_WRITE, " wc=" },
+ { S_ACTIVE_READS, " active_reads=" },
+ { S_BANDWIDTH, " bandwidth=" },
+ { S_NO_BANDWIDTH, " no_bandwidth=" },
+ { S_BARRIER, " barrier=" },
+ { S_BIO_COPY_PL_NEXT, " bio_copy_pl_next=" },
+ { S_CAN_MERGE, " can_merge=" },
+ { S_MERGE_PAGE_LOCKED, "/page_locked=" },
+ { S_CANT_MERGE, "/cant_merge=" },
+ { S_CONGESTED, " congested=" },
+ { S_NOT_CONGESTED, "/not_congested=" },
+ { S_DEGRADED, " degraded=" },
+ { S_DELAYED_BIOS, " delayed_bios=" },
+ { S_SUM_DELAYED_BIOS, "/sum_delayed_bios=" },
+ { S_EVICT, " evict=" },
+ { S_FLUSHS, " flushs=" },
+ { S_HITS_1ST, " hits_1st=" },
+ { S_IOS_POST, " ios_post=" },
+ { S_INSCACHE, " inscache=" },
+ { S_MAX_LOOKUP, " max_lookup=" },
+ { S_NO_RW, " no_rw=" },
+ { S_NOSYNC, " nosync=" },
+ { S_PROHIBITPAGEIO, " ProhibitPageIO=" },
+ { S_RECONSTRUCT_EI, " reconstruct_ei=" },
+ { S_RECONSTRUCT_DEV, " reconstruct_dev=" },
+ { S_REDO, " redo=" },
+ { S_REQUEUE, " requeue=" },
+ { S_STRIPE_ERROR, " stripe_error=" },
+ { S_XORS, " xors=" },
+};
+
+/*
+ * A RAID set.
+ */
+typedef void (*xor_function_t)(unsigned count, unsigned long **data);
+struct raid_set {
+ struct dm_target *ti; /* Target pointer. */
+
+ struct {
+ unsigned long flags; /* State flags. */
+ spinlock_t in_lock; /* Protects central input list below. */
+ struct bio_list in; /* Pending ios (central input list). */
+ struct bio_list work; /* ios work set. */
+ wait_queue_head_t suspendq; /* suspend synchronization. */
+ atomic_t in_process; /* counter of queued bios (suspendq). */
+ atomic_t in_process_max;/* counter of queued bios max. */
+
+ /* io work. */
+ struct workqueue_struct *wq;
+ struct delayed_work dws;
+ } io;
+
+ /* External locking. */
+ struct dm_raid45_locking_type *locking;
+
+ struct stripe_cache sc; /* Stripe cache for this set. */
+
+ /* Xor optimization. */
+ struct {
+ struct xor_func *f;
+ unsigned chunks;
+ unsigned speed;
+ } xor;
+
+ /* Recovery parameters. */
+ struct recover {
+ struct dm_dirty_log *dl; /* Dirty log. */
+ struct dm_region_hash *rh; /* Region hash. */
+
+ /* dm-mem-cache client resource context for recovery stripes. */
+ struct dm_mem_cache_client *mem_cache_client;
+
+ struct list_head stripes; /* List of recovery stripes. */
+
+ region_t nr_regions;
+ region_t nr_regions_to_recover;
+ region_t nr_regions_recovered;
+ unsigned long start_jiffies;
+ unsigned long end_jiffies;
+
+ unsigned bandwidth; /* Recovery bandwidth [%]. */
+ unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
+ unsigned bandwidth_parm; /* " constructor parm. */
+ unsigned io_size; /* io size <= chunk size. */
+ unsigned io_size_parm; /* io size ctr parameter. */
+
+ /* recovery io throttling. */
+ atomic_t io_count[2]; /* counter recover/regular io. */
+ unsigned long last_jiffies;
+
+ struct dm_region *reg; /* Actual region to recover. */
+ sector_t pos; /* Position within region to recover. */
+ sector_t end; /* End of region to recover. */
+ } recover;
+
+ /* RAID set parameters. */
+ struct {
+		struct raid_type *raid_type;	/* RAID type (e.g. RAID4). */
+ unsigned raid_parms; /* # variable raid parameters. */
+
+ unsigned chunk_size; /* Sectors per chunk. */
+ unsigned chunk_size_parm;
+		unsigned chunk_mask;	/* Mask for sector offset within a chunk. */
+ unsigned chunk_shift; /* rsector chunk size shift. */
+
+ unsigned io_size; /* Sectors per io. */
+ unsigned io_size_parm;
+		unsigned io_mask;	/* Mask for sector offset within an io chunk. */
+ unsigned io_shift_mask; /* Mask for raid_address(). */
+ unsigned io_shift; /* rsector io size shift. */
+ unsigned pages_per_io; /* Pages per io. */
+
+ sector_t sectors_per_dev; /* Sectors per device. */
+
+		atomic_t failed_devs;	/* Number of failed devices. */
+
+ /* Index of device to initialize. */
+ int dev_to_init;
+ int dev_to_init_parm;
+
+ /* Raid devices dynamically allocated. */
+ unsigned raid_devs; /* # of RAID devices below. */
+ unsigned data_devs; /* # of RAID data devices. */
+
+ int ei; /* index of failed RAID device. */
+
+ /* index of dedicated parity device (i.e. RAID4). */
+ int pi;
+ int pi_parm; /* constructor parm for status output. */
+ } set;
+
+ /* REMOVEME: devel stats counters. */
+ atomic_t stats[S_NR_STATS];
+
+ /* Dynamically allocated temporary pointers for xor(). */
+ unsigned long **data;
+
+ /* Dynamically allocated RAID devices. Alignment? */
+ struct raid_dev dev[0];
+};
+
+
+BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
+BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
+BITOPS(RS, Dead, raid_set, RS_DEAD)
+BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
+BITOPS(RS, IoError, raid_set, RS_IO_ERROR)
+BITOPS(RS, Recover, raid_set, RS_RECOVER)
+BITOPS(RS, RegionGet, raid_set, RS_REGION_GET)
+BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
+BITOPS(RS, Suspended, raid_set, RS_SUSPENDED)
+#undef BITOPS
+
+#define PageIO(page) PageChecked(page)
+#define AllowPageIO(page) SetPageChecked(page)
+#define ProhibitPageIO(page) ClearPageChecked(page)
+
+/*-----------------------------------------------------------------
+ * Raid-4/5 set structures.
+ *---------------------------------------------------------------*/
+/* RAID level definitions. */
+enum raid_level {
+ raid4,
+ raid5,
+};
+
+/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
+enum raid_algorithm {
+ none,
+ left_asym,
+ right_asym,
+ left_sym,
+ right_sym,
+};
+
+struct raid_type {
+ const char *name; /* RAID algorithm. */
+ const char *descr; /* Descriptor text for logging. */
+ const unsigned parity_devs; /* # of parity devices. */
+ const unsigned minimal_devs; /* minimal # of devices in set. */
+ const enum raid_level level; /* RAID level. */
+ const enum raid_algorithm algorithm; /* RAID algorithm. */
+};
+
+/* Supported raid types and properties. */
+static struct raid_type raid_types[] = {
+ {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
+ {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym},
+ {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym},
+ {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym},
+ {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym},
+};
+
+/* Address as calculated by raid_address(). */
+struct address {
+ sector_t key; /* Hash key (start address of stripe). */
+ unsigned di, pi; /* Data and parity disks index. */
+};
+
+/* REMOVEME: reset statistics counters. */
+static void stats_reset(struct raid_set *rs)
+{
+ unsigned s = S_NR_STATS;
+
+ while (s--)
+ atomic_set(rs->stats + s, 0);
+}
+
+/*----------------------------------------------------------------
+ * RAID set management routines.
+ *--------------------------------------------------------------*/
+/*
+ * Begin small helper functions.
+ */
+/* Queue (optionally delayed) io work. */
+static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
+{
+ struct delayed_work *dws = &rs->io.dws;
+
+ cancel_delayed_work(dws);
+ queue_delayed_work(rs->io.wq, dws, delay);
+}
+
+/* Queue io work immediately (called from region hash too). */
+static INLINE void wake_do_raid(void *context)
+{
+ wake_do_raid_delayed(context, 0);
+}
+
+/* Wait until all io has been processed. */
+static INLINE void wait_ios(struct raid_set *rs)
+{
+ wait_event(rs->io.suspendq, !atomic_read(&rs->io.in_process));
+}
+
+/* Declare io queued to device. */
+static INLINE void io_dev_queued(struct raid_dev *dev)
+{
+ set_bit(IO_QUEUED, &dev->flags);
+}
+
+/* Io queued to device? Test and reset. */
+static inline int io_dev_clear(struct raid_dev *dev)
+{
+ return test_and_clear_bit(IO_QUEUED, &dev->flags);
+}
+
+/* Get an io reference. */
+static INLINE void io_get(struct raid_set *rs)
+{
+ int p = atomic_inc_return(&rs->io.in_process);
+
+ if (p > atomic_read(&rs->io.in_process_max))
+ atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
+}
+
+/* Put the io reference and conditionally wake io waiters. */
+static INLINE void io_put(struct raid_set *rs)
+{
+ /* Intel: rebuild data corrupter? */
+ if (!atomic_read(&rs->io.in_process)) {
+ DMERR("%s would go negative!!!", __func__);
+ return;
+ }
+
+ if (atomic_dec_and_test(&rs->io.in_process))
+ wake_up(&rs->io.suspendq);
+}
+
+/* Calculate device sector offset. */
+static INLINE sector_t _sector(struct raid_set *rs, struct bio *bio)
+{
+ sector_t sector = bio->bi_sector;
+
+ sector_div(sector, rs->set.data_devs);
+ return sector;
+}
+
+/* Test device operational. */
+static INLINE int dev_operational(struct raid_set *rs, unsigned p)
+{
+ return !test_bit(DEVICE_FAILED, &rs->dev[p].flags);
+}
+
+/* Return # of active stripes in stripe cache. */
+static INLINE int sc_active(struct stripe_cache *sc)
+{
+ return atomic_read(&sc->active_stripes);
+}
+
+/* Test io pending on stripe. */
+static INLINE int stripe_io(struct stripe *stripe)
+{
+ return atomic_read(&stripe->io.pending);
+}
+
+static INLINE void stripe_io_inc(struct stripe *stripe)
+{
+ atomic_inc(&stripe->io.pending);
+}
+
+static INLINE void stripe_io_dec(struct stripe *stripe)
+{
+ atomic_dec(&stripe->io.pending);
+}
+
+/* Wrapper needed by for_each_io_dev(). */
+static void _stripe_io_inc(struct stripe *stripe, unsigned p)
+{
+ stripe_io_inc(stripe);
+}
+
+/* Error a stripe. */
+static INLINE void stripe_error(struct stripe *stripe, struct page *page)
+{
+ SetStripeError(stripe);
+ SetPageError(page);
+ atomic_inc(RS(stripe->sc)->stats + S_STRIPE_ERROR);
+}
+
+/* Page IOed ok. */
+enum dirty_type { CLEAN, DIRTY };
+static INLINE void page_set(struct page *page, enum dirty_type type)
+{
+ switch (type) {
+ case DIRTY:
+ SetPageDirty(page);
+ AllowPageIO(page);
+ break;
+
+ case CLEAN:
+ ClearPageDirty(page);
+ break;
+
+ default:
+ BUG();
+ }
+
+ SetPageUptodate(page);
+ ClearPageError(page);
+}
+
+/* Return region state for a sector. */
+static INLINE int
+region_state(struct raid_set *rs, sector_t sector, unsigned long state)
+{
+ struct dm_region_hash *rh = rs->recover.rh;
+
+ return RSRecover(rs) ?
+ (dm_rh_get_state(rh, dm_rh_sector_to_region(rh, sector), 1) &
+ state) : 0;
+}
+
+/* Check whether the RAID set is degraded (a member device error got flagged). */
+static inline int raid_set_degraded(struct raid_set *rs)
+{
+ return RSIoError(rs);
+}
+
+/* Check that no more devices failed than the RAID set can tolerate. */
+static INLINE int raid_set_operational(struct raid_set *rs)
+{
+ /* Too many failed devices -> BAD. */
+ return atomic_read(&rs->set.failed_devs) <=
+ rs->set.raid_type->parity_devs;
+}
+
+/*
+ * Return true in case a page_list should be read/written
+ *
+ * Conditions to read/write:
+ * o 1st page in list not uptodate
+ * o 1st page in list dirty
+ * o if we optimized io away, we flag it using the pages checked bit.
+ */
+static INLINE unsigned page_io(struct page *page)
+{
+ /* Optimization: page was flagged to need io during first run. */
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ return 1;
+ }
+
+ /* Avoid io if prohibited or a locked page. */
+ if (!PageIO(page) || PageLocked(page))
+ return 0;
+
+ if (!PageUptodate(page) || PageDirty(page)) {
+ /* Flag page needs io for second run optimization. */
+ SetPagePrivate(page);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Call a function on each page list needing io. */
+static INLINE unsigned
+for_each_io_dev(struct raid_set *rs, struct stripe *stripe,
+ void (*f_io)(struct stripe *stripe, unsigned p))
+{
+ unsigned p = rs->set.raid_devs, r = 0;
+
+ while (p--) {
+ if (page_io(PAGE(stripe, p))) {
+ f_io(stripe, p);
+ r++;
+ }
+ }
+
+ return r;
+}
+
+/* Reconstruct a particular device? */
+static INLINE int dev_to_init(struct raid_set *rs)
+{
+ return rs->set.dev_to_init > -1;
+}
+
+/*
+ * Index of device to calculate parity on.
+ * Either the parity device index *or* the selected device to init
+ * after a spare replacement.
+ */
+static INLINE unsigned dev_for_parity(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+
+ return dev_to_init(rs) ? rs->set.dev_to_init : stripe->idx.parity;
+}
+
+/* Return the index of the device to be recovered. */
+static int idx_get(struct raid_set *rs)
+{
+	/* Avoid reading in pages which will be reconstructed anyway. */
+ if (dev_to_init(rs))
+ return rs->set.dev_to_init;
+ else if (rs->set.raid_type->level == raid4)
+ return rs->set.pi;
+
+ return -1;
+}
+
+/* RAID set congested function. */
+static int raid_set_congested(void *congested_data, int bdi_bits)
+{
+ struct raid_set *rs = congested_data;
+ int r = 0; /* Assume uncongested. */
+ unsigned p = rs->set.raid_devs;
+
+ /* If any of our component devices are overloaded. */
+ while (p--) {
+ struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
+
+ r |= bdi_congested(&q->backing_dev_info, bdi_bits);
+ }
+
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
+ return r;
+}
+
+/* Display RAID set dead message once. */
+static void raid_set_dead(struct raid_set *rs)
+{
+ if (!TestSetRSDead(rs)) {
+ unsigned p;
+ char buf[BDEVNAME_SIZE];
+
+ DMERR("FATAL: too many devices failed -> RAID set dead");
+
+ for (p = 0; p < rs->set.raid_devs; p++) {
+ if (!dev_operational(rs, p))
+ DMERR("device /dev/%s failed",
+ bdevname(rs->dev[p].dev->bdev, buf));
+ }
+ }
+}
+
+/* RAID set degrade check. */
+static INLINE int
+raid_set_check_and_degrade(struct raid_set *rs,
+ struct stripe *stripe, unsigned p)
+{
+ if (test_and_set_bit(DEVICE_FAILED, &rs->dev[p].flags))
+ return -EPERM;
+
+	/* Throw an event in case of member device errors. */
+ dm_table_event(rs->ti->table);
+ atomic_inc(&rs->set.failed_devs);
+
+ /* Only log the first member error. */
+ if (!TestSetRSIoError(rs)) {
+ char buf[BDEVNAME_SIZE];
+
+ /* Store index for recovery. */
+ mb();
+ rs->set.ei = p;
+ mb();
+
+ DMERR("CRITICAL: %sio error on device /dev/%s "
+ "in region=%llu; DEGRADING RAID set",
+ stripe ? "" : "FAKED ",
+ bdevname(rs->dev[p].dev->bdev, buf),
+ (unsigned long long) (stripe ? stripe->key : 0));
+ DMERR("further device error messages suppressed");
+ }
+
+ return 0;
+}
+
+static void
+raid_set_check_degrade(struct raid_set *rs, struct stripe *stripe)
+{
+ unsigned p = rs->set.raid_devs;
+
+ while (p--) {
+ struct page *page = PAGE(stripe, p);
+
+ if (PageError(page)) {
+ ClearPageError(page);
+ raid_set_check_and_degrade(rs, stripe, p);
+ }
+ }
+}
+
+/* RAID set upgrade check. */
+static int raid_set_check_and_upgrade(struct raid_set *rs, unsigned p)
+{
+ if (!test_and_clear_bit(DEVICE_FAILED, &rs->dev[p].flags))
+ return -EPERM;
+
+ if (atomic_dec_and_test(&rs->set.failed_devs)) {
+ ClearRSIoError(rs);
+ rs->set.ei = -1;
+ }
+
+ return 0;
+}
+
+/* Lookup a RAID device by name or by major:minor number. */
+union dev_lookup {
+ const char *dev_name;
+ struct raid_dev *dev;
+};
+enum lookup_type { byname, bymajmin, bynumber };
+static int raid_dev_lookup(struct raid_set *rs, enum lookup_type by,
+ union dev_lookup *dl)
+{
+ unsigned p;
+
+ /*
+ * Must be an incremental loop, because the device array
+ * can have empty slots still on calls from raid_ctr()
+ */
+ for (p = 0; p < rs->set.raid_devs; p++) {
+ char buf[BDEVNAME_SIZE];
+ struct raid_dev *dev = rs->dev + p;
+
+ if (!dev->dev)
+ break;
+
+ /* Format dev string appropriately if necessary. */
+ if (by == byname)
+ bdevname(dev->dev->bdev, buf);
+ else if (by == bymajmin)
+ format_dev_t(buf, dev->dev->bdev->bd_dev);
+
+ /* Do the actual check. */
+ if (by == bynumber) {
+ if (dl->dev->dev->bdev->bd_dev ==
+ dev->dev->bdev->bd_dev)
+ return p;
+ } else if (!strcmp(dl->dev_name, buf))
+ return p;
+ }
+
+ return -ENODEV;
+}
+
+/* End io wrapper. */
+static INLINE void
+_bio_endio(struct raid_set *rs, struct bio *bio, int error)
+{
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + (bio_data_dir(bio) == WRITE ?
+ S_BIOS_ENDIO_WRITE : S_BIOS_ENDIO_READ));
+ bio_endio(bio, error);
+ io_put(rs); /* Wake any suspend waiters. */
+}
+
+/*
+ * End small helper functions.
+ */
+
+
+/*
+ * Stripe hash functions
+ */
+/* Initialize/destroy stripe hash. */
+static int hash_init(struct stripe_hash *hash, unsigned stripes)
+{
+ unsigned buckets = 2, max_buckets = stripes / 4;
+ unsigned hash_primes[] = {
+ /* Table of primes for hash_fn/table size optimization. */
+ 3, 7, 13, 27, 53, 97, 193, 389, 769,
+ 1543, 3079, 6151, 12289, 24593,
+ };
+
+	/* Calculate number of buckets (2^n <= stripes / 4). */
+ while (buckets < max_buckets)
+ buckets <<= 1;
+
+ /* Allocate stripe hash. */
+ hash->hash = vmalloc(buckets * sizeof(*hash->hash));
+ if (!hash->hash)
+ return -ENOMEM;
+
+ hash->buckets = buckets;
+ hash->mask = buckets - 1;
+ hash->shift = ffs(buckets);
+ if (hash->shift > ARRAY_SIZE(hash_primes) + 1)
+ hash->shift = ARRAY_SIZE(hash_primes) + 1;
+
+ BUG_ON(hash->shift - 2 > ARRAY_SIZE(hash_primes) + 1);
+ hash->prime = hash_primes[hash->shift - 2];
+
+ /* Initialize buckets. */
+ while (buckets--)
+ INIT_LIST_HEAD(hash->hash + buckets);
+
+ return 0;
+}
+
+static INLINE void hash_exit(struct stripe_hash *hash)
+{
+ if (hash->hash) {
+ vfree(hash->hash);
+ hash->hash = NULL;
+ }
+}
+
+/* List add (head/tail/locked/unlocked) inlines. */
+enum list_lock_type { LIST_LOCKED, LIST_UNLOCKED };
+#define LIST_DEL(name, list) \
+static void stripe_ ## name ## _del(struct stripe *stripe, \
+ enum list_lock_type lock) { \
+ struct list_head *lh = stripe->lists + (list); \
+ spinlock_t *l = NULL; \
+\
+ if (lock == LIST_LOCKED) { \
+ l = stripe->sc->locks + LOCK_LRU; \
+ spin_lock_irq(l); \
+ } \
+\
+\
+ if (!list_empty(lh)) \
+ list_del_init(lh); \
+\
+ if (lock == LIST_LOCKED) \
+ spin_unlock_irq(l); \
+}
+
+LIST_DEL(hash, LIST_HASH)
+LIST_DEL(lru, LIST_LRU)
+#undef LIST_DEL
+
+enum list_pos_type { POS_HEAD, POS_TAIL };
+#define LIST_ADD(name, list) \
+static void stripe_ ## name ## _add(struct stripe *stripe, \
+ enum list_pos_type pos, \
+ enum list_lock_type lock) { \
+ struct list_head *lh = stripe->lists + (list); \
+ struct stripe_cache *sc = stripe->sc; \
+ spinlock_t *l = NULL; \
+\
+ if (lock == LIST_LOCKED) { \
+ l = sc->locks + LOCK_LRU; \
+ spin_lock_irq(l); \
+ } \
+\
+ if (list_empty(lh)) { \
+ if (pos == POS_HEAD) \
+ list_add(lh, sc->lists + (list)); \
+ else \
+ list_add_tail(lh, sc->lists + (list)); \
+ } \
+\
+ if (lock == LIST_LOCKED) \
+ spin_unlock_irq(l); \
+}
+
+LIST_ADD(endio, LIST_ENDIO)
+LIST_ADD(io, LIST_IO)
+LIST_ADD(lru, LIST_LRU)
+#undef LIST_ADD
+
+#define POP(list) \
+ do { \
+ if (list_empty(sc->lists + list)) \
+ stripe = NULL; \
+ else { \
+ stripe = list_first_entry(&sc->lists[list], \
+ struct stripe, \
+ lists[list]); \
+ list_del_init(&stripe->lists[list]); \
+ } \
+ } while (0);
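+/* Note: POP() relies on the caller holding the respective list's lock. */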
+
+/* Pop an available stripe off the lru list. */
+static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
+{
+ struct stripe *stripe;
+ spinlock_t *lock = sc->locks + LOCK_LRU;
+
+ spin_lock_irq(lock);
+ POP(LIST_LRU);
+ spin_unlock_irq(lock);
+
+ if (stripe)
+ /* Remove from hash before reuse. */
+ stripe_hash_del(stripe, LIST_UNLOCKED);
+
+ return stripe;
+}
+
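+/* Multiplicative hash: fold key * prime into the bucket mask. */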
+static inline unsigned hash_fn(struct stripe_hash *hash, sector_t key)
+{
+ return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
+}
+
+static inline struct list_head *
+hash_bucket(struct stripe_hash *hash, sector_t key)
+{
+ return hash->hash + hash_fn(hash, key);
+}
+
+/* Insert an entry into a hash. */
+static inline void hash_insert(struct stripe_hash *hash, struct stripe *stripe)
+{
+ list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
+}
+
+/* Insert an entry into the stripe hash. */
+static inline void
+sc_insert(struct stripe_cache *sc, struct stripe *stripe)
+{
+ hash_insert(&sc->hash, stripe);
+}
+
+/* Lookup an entry in the stripe hash. */
+static inline struct stripe *
+stripe_lookup(struct stripe_cache *sc, sector_t key)
+{
+ unsigned c = 0;
+ struct stripe *stripe;
+ struct list_head *bucket = hash_bucket(&sc->hash, key);
+
+ list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
+		/* REMOVEME: statistics. */
+ if (++c > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
+ atomic_set(RS(sc)->stats + S_MAX_LOOKUP, c);
+
+ if (stripe->key == key)
+ return stripe;
+ }
+
+ return NULL;
+}
+
+/* Resize the stripe cache hash on size changes. */
+static int hash_resize(struct stripe_cache *sc)
+{
+ /* Resize threshold reached? */
+ if (atomic_read(&sc->stripes) > 2 * atomic_read(&sc->stripes_last)
+ || atomic_read(&sc->stripes) < atomic_read(&sc->stripes_last) / 4) {
+ int r;
+ struct stripe_hash hash, hash_tmp;
+ spinlock_t *lock;
+
+ r = hash_init(&hash, atomic_read(&sc->stripes));
+ if (r)
+ return r;
+
+ lock = sc->locks + LOCK_LRU;
+ spin_lock_irq(lock);
+ if (sc->hash.hash) {
+ unsigned b = sc->hash.buckets;
+ struct list_head *pos, *tmp;
+
+ /* Walk old buckets and insert into new. */
+ while (b--) {
+ list_for_each_safe(pos, tmp, sc->hash.hash + b)
+ hash_insert(&hash,
+ list_entry(pos, struct stripe,
+ lists[LIST_HASH]));
+ }
+
+ }
+
+ memcpy(&hash_tmp, &sc->hash, sizeof(hash_tmp));
+ memcpy(&sc->hash, &hash, sizeof(sc->hash));
+ atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
+ spin_unlock_irq(lock);
+
+ hash_exit(&hash_tmp);
+ }
+
+ return 0;
+}
+
+/*
+ * Stripe cache locking functions
+ */
+/* Dummy lock function for local RAID4+5. */
+static void *no_lock(sector_t key, enum dm_lock_type type)
+{
+ return &no_lock;
+}
+
+/* Dummy unlock function for local RAID4+5. */
+static void no_unlock(void *lock_handle)
+{
+}
+
+/* No locking (for local RAID 4+5). */
+static struct dm_raid45_locking_type locking_none = {
+ .lock = no_lock,
+ .unlock = no_unlock,
+};
+
+/* Clustered RAID 4+5. */
+/* FIXME: code this. */
+static struct dm_raid45_locking_type locking_cluster = {
+ .lock = no_lock,
+ .unlock = no_unlock,
+};
+
+/* Lock a stripe (for clustering). */
+static int
+stripe_lock(struct raid_set *rs, struct stripe *stripe, int rw, sector_t key)
+{
+ stripe->lock = rs->locking->lock(key, rw == READ ? DM_RAID45_SHARED :
+ DM_RAID45_EX);
+ return stripe->lock ? 0 : -EPERM;
+}
+
+/* Unlock a stripe (for clustering). */
+static void stripe_unlock(struct raid_set *rs, struct stripe *stripe)
+{
+ rs->locking->unlock(stripe->lock);
+ stripe->lock = NULL;
+}
+
+/*
+ * Stripe cache functions.
+ */
+/*
+ * Invalidate all page lists pages of a stripe.
+ *
+ * I only keep state for the whole list in the first page.
+ */
+static INLINE void
+stripe_pages_invalidate(struct stripe *stripe)
+{
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ while (p--) {
+ struct page *page = PAGE(stripe, p);
+
+ ProhibitPageIO(page);
+ ClearPageChecked(page);
+ ClearPageDirty(page);
+ ClearPageError(page);
+ __clear_page_locked(page);
+ ClearPagePrivate(page);
+ ClearPageUptodate(page);
+ }
+}
+
+/* Prepare stripe for (re)use. */
+static INLINE void stripe_invalidate(struct stripe *stripe)
+{
+ stripe->io.flags = 0;
+ stripe_pages_invalidate(stripe);
+}
+
+/* Allow io on all chunks of a stripe. */
+static INLINE void stripe_allow_io(struct stripe *stripe)
+{
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ while (p--)
+ AllowPageIO(PAGE(stripe, p));
+}
+
+/* Initialize a stripe. */
+static void
+stripe_init(struct stripe_cache *sc, struct stripe *stripe)
+{
+ unsigned p = RS(sc)->set.raid_devs;
+ unsigned i;
+
+ /* Work all io chunks. */
+ while (p--) {
+ struct stripe_set *ss = stripe->ss + p;
+
+ stripe->obj[p].private = ss;
+ ss->stripe = stripe;
+
+ i = ARRAY_SIZE(ss->bl);
+ while (i--)
+ bio_list_init(ss->bl + i);
+ }
+
+ stripe->sc = sc;
+
+ i = ARRAY_SIZE(stripe->lists);
+ while (i--)
+ INIT_LIST_HEAD(stripe->lists + i);
+
+ atomic_set(&stripe->cnt, 0);
+ atomic_set(&stripe->io.pending, 0);
+
+ stripe_invalidate(stripe);
+}
+
+/* Number of pages per chunk. */
+static inline unsigned chunk_pages(unsigned io_size)
+{
+ return dm_div_up(io_size, SECTORS_PER_PAGE);
+}
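+/* E.g. an io_size of 8 sectors with 4 KiB pages (SECTORS_PER_PAGE == 8) needs 1 page. */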
+
+/* Number of pages per stripe. */
+static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
+{
+ return chunk_pages(io_size) * rs->set.raid_devs;
+}
+
+/* Initialize part of page_list (recovery). */
+static INLINE void stripe_zero_pl_part(struct stripe *stripe, unsigned p,
+ unsigned start, unsigned count)
+{
+ unsigned pages = chunk_pages(count);
+ /* Get offset into the page_list. */
+ struct page_list *pl = pl_elem(PL(stripe, p), start / SECTORS_PER_PAGE);
+
+ BUG_ON(!pl);
+ while (pl && pages--) {
+ BUG_ON(!pl->page);
+ memset(page_address(pl->page), 0, PAGE_SIZE);
+ pl = pl->next;
+ }
+}
+
+/* Initialize parity chunk of stripe. */
+static INLINE void stripe_zero_chunk(struct stripe *stripe, unsigned p)
+{
+ stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
+}
+
+/* Return dynamic stripe structure size. */
+static INLINE size_t stripe_size(struct raid_set *rs)
+{
+ return sizeof(struct stripe) +
+ rs->set.raid_devs * sizeof(struct stripe_set);
+}
+
+/* Allocate a stripe and its memory object. */
+/* XXX adjust to cope with stripe cache and recovery stripe caches. */
+enum grow { SC_GROW, SC_KEEP };
+static struct stripe *stripe_alloc(struct stripe_cache *sc,
+ struct dm_mem_cache_client *mc,
+ enum grow grow)
+{
+ int r;
+ struct stripe *stripe;
+
+ stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
+ if (stripe) {
+ /* Grow the dm-mem-cache by one object. */
+ if (grow == SC_GROW) {
+ r = dm_mem_cache_grow(mc, 1);
+ if (r)
+ goto err_free;
+ }
+
+ stripe->obj = dm_mem_cache_alloc(mc);
+ if (!stripe->obj)
+ goto err_shrink;
+
+ stripe_init(sc, stripe);
+ }
+
+ return stripe;
+
+err_shrink:
+ if (grow == SC_GROW)
+ dm_mem_cache_shrink(mc, 1);
+err_free:
+ kmem_cache_free(sc->kc.cache, stripe);
+ return NULL;
+}
+
+/*
+ * Free a stripe's memory object, shrink the
+ * memory cache and free the stripe itself.
+ */
+static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
+{
+ dm_mem_cache_free(mc, stripe->obj);
+ dm_mem_cache_shrink(mc, 1);
+ kmem_cache_free(stripe->sc->kc.cache, stripe);
+}
+
+/* Free the recovery stripes. */
+static void stripe_recover_free(struct raid_set *rs)
+{
+ struct recover *rec = &rs->recover;
+ struct list_head *stripes = &rec->stripes;
+
+ while (!list_empty(stripes)) {
+ struct stripe *stripe = list_first_entry(stripes, struct stripe,
+ lists[LIST_RECOVER]);
+ list_del(stripe->lists + LIST_RECOVER);
+ stripe_free(stripe, rec->mem_cache_client);
+ }
+}
+
+/* Push a stripe safely onto the endio list to be handled by do_endios(). */
+static INLINE void stripe_endio_push(struct stripe *stripe)
+{
+ int wake;
+ unsigned long flags;
+ struct stripe_cache *sc = stripe->sc;
+ spinlock_t *lock = sc->locks + LOCK_ENDIO;
+
+ spin_lock_irqsave(lock, flags);
+ wake = list_empty(sc->lists + LIST_ENDIO);
+ stripe_endio_add(stripe, POS_HEAD, LIST_UNLOCKED);
+ spin_unlock_irqrestore(lock, flags);
+
+ if (wake)
+ wake_do_raid(RS(sc));
+}
+
+/* Protected check for stripe cache endio list empty. */
+static INLINE int stripe_endio_empty(struct stripe_cache *sc)
+{
+ int r;
+ spinlock_t *lock = sc->locks + LOCK_ENDIO;
+
+ spin_lock_irq(lock);
+ r = list_empty(sc->lists + LIST_ENDIO);
+ spin_unlock_irq(lock);
+
+ return r;
+}
+
+/* Safely pop a stripe off the endio list. */
+static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
+{
+ struct stripe *stripe;
+ spinlock_t *lock = sc->locks + LOCK_ENDIO;
+
+ /* This runs in parallel with endio(). */
+ spin_lock_irq(lock);
+ POP(LIST_ENDIO)
+ spin_unlock_irq(lock);
+ return stripe;
+}
+
+#undef POP
+
+/* Evict stripe from cache. */
+static void stripe_evict(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ stripe_hash_del(stripe, LIST_UNLOCKED); /* Take off hash. */
+
+ if (list_empty(stripe->lists + LIST_LRU)) {
+ stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
+ atomic_inc(rs->stats + S_EVICT); /* REMOVEME: statistics. */
+ }
+}
+
+/* Grow stripe cache. */
+static int
+sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
+{
+ int r = 0;
+ struct raid_set *rs = RS(sc);
+
+ /* Try to allocate this many (additional) stripes. */
+ while (stripes--) {
+ struct stripe *stripe =
+ stripe_alloc(sc, sc->mem_cache_client, grow);
+
+ if (likely(stripe)) {
+ stripe->io.size = rs->set.io_size;
+ stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
+ atomic_inc(&sc->stripes);
+ } else {
+ r = -ENOMEM;
+ break;
+ }
+ }
+
+ ClearRSScBusy(rs);
+ return r ? r : hash_resize(sc);
+}
+
+/* Shrink stripe cache. */
+static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
+{
+ int r = 0;
+
+ /* Try to get unused stripe from LRU list. */
+ while (stripes--) {
+ struct stripe *stripe;
+
+ stripe = stripe_lru_pop(sc);
+ if (stripe) {
+ /* An lru stripe may never have ios pending! */
+ BUG_ON(stripe_io(stripe));
+ stripe_free(stripe, sc->mem_cache_client);
+ atomic_dec(&sc->stripes);
+ } else {
+ r = -ENOENT;
+ break;
+ }
+ }
+
+ /* Check if stats are still sane. */
+ if (atomic_read(&sc->max_active_stripes) >
+ atomic_read(&sc->stripes))
+ atomic_set(&sc->max_active_stripes, 0);
+
+ if (r)
+ return r;
+
+ ClearRSScBusy(RS(sc));
+ return hash_resize(sc);
+}
+
+/* Create stripe cache. */
+static int sc_init(struct raid_set *rs, unsigned stripes)
+{
+ unsigned i, nr;
+ struct stripe_cache *sc = &rs->sc;
+ struct stripe *stripe;
+ struct recover *rec = &rs->recover;
+
+ /* Initialize lists and locks. */
+ i = ARRAY_SIZE(sc->lists);
+ while (i--)
+ INIT_LIST_HEAD(sc->lists + i);
+
+ i = NR_LOCKS;
+ while (i--)
+ spin_lock_init(sc->locks + i);
+
+ /* Initialize atomic variables. */
+ atomic_set(&sc->stripes, 0);
+ atomic_set(&sc->stripes_last, 0);
+ atomic_set(&sc->stripes_to_shrink, 0);
+ atomic_set(&sc->active_stripes, 0);
+ atomic_set(&sc->max_active_stripes, 0); /* REMOVEME: statistics. */
+
+ /*
+ * We need a runtime unique # to suffix the kmem cache name
+ * because we'll have one for each active RAID set.
+ */
+ nr = atomic_inc_return(&_stripe_sc_nr);
+ sprintf(sc->kc.name, "%s_%d", TARGET, nr);
+ sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
+ 0, 0, NULL);
+ if (!sc->kc.cache)
+ return -ENOMEM;
+
+ /* Create memory cache client context for RAID stripe cache. */
+ sc->mem_cache_client =
+ dm_mem_cache_client_create(stripes, rs->set.raid_devs,
+ chunk_pages(rs->set.io_size));
+ if (IS_ERR(sc->mem_cache_client))
+ return PTR_ERR(sc->mem_cache_client);
+
+ /* Create memory cache client context for RAID recovery stripe(s). */
+ rec->mem_cache_client =
+ dm_mem_cache_client_create(MAX_RECOVER, rs->set.raid_devs,
+ chunk_pages(rec->io_size));
+ if (IS_ERR(rec->mem_cache_client))
+ return PTR_ERR(rec->mem_cache_client);
+
+ /* Allocate stripe for set recovery. */
+ /* XXX: cope with MAX_RECOVERY. */
+ INIT_LIST_HEAD(&rec->stripes);
+ for (i = 0; i < MAX_RECOVER; i++) {
+ stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
+ if (!stripe)
+ return -ENOMEM;
+
+ SetStripeRecover(stripe);
+ stripe->io.size = rec->io_size;
+ list_add(stripe->lists + LIST_RECOVER, &rec->stripes);
+ }
+
+ /*
+	 * Allocate the stripe objects from the
+ * cache and add them to the LRU list.
+ */
+ return sc_grow(sc, stripes, SC_KEEP);
+}
+
+/* Destroy the stripe cache. */
+static void sc_exit(struct stripe_cache *sc)
+{
+ if (sc->kc.cache) {
+ BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
+ kmem_cache_destroy(sc->kc.cache);
+ }
+
+ if (sc->mem_cache_client)
+ dm_mem_cache_client_destroy(sc->mem_cache_client);
+
+ ClearRSRecover(RS(sc));
+ stripe_recover_free(RS(sc));
+ if (RS(sc)->recover.mem_cache_client)
+ dm_mem_cache_client_destroy(RS(sc)->recover.mem_cache_client);
+
+ hash_exit(&sc->hash);
+}
+
+/*
+ * Calculate RAID address
+ *
+ * Delivers tuple with the index of the data disk holding the chunk
+ * in the set, the parity disks index and the start of the stripe
+ * within the address space of the set (used as the stripe cache hash key).
+ */
+/* thx MD. */
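+/*
+ * Illustration (values assumed for the example): 4 devices (3 data),
+ * raid5 left symmetric, chunk_shift = 3 (8 sector chunks), sector 50:
+ * chunk_number = 6, stripe = 2, di = 6 % 3 = 0;
+ * pi = 2 % 4 = 2 -> pi = 3 - 2 = 1; di = (1 + 0 + 1) % 4 = 2.
+ */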
+static struct address *
+raid_address(struct raid_set *rs, sector_t sector, struct address *addr)
+{
+ unsigned data_devs = rs->set.data_devs, di, pi,
+ raid_devs = rs->set.raid_devs;
+ sector_t stripe, tmp;
+
+ /*
+ * chunk_number = sector / chunk_size
+ * stripe = chunk_number / data_devs
+ * di = stripe % data_devs;
+ */
+ stripe = sector >> rs->set.chunk_shift;
+ di = sector_div(stripe, data_devs);
+
+ switch (rs->set.raid_type->level) {
+ case raid5:
+ tmp = stripe;
+ pi = sector_div(tmp, raid_devs);
+
+ switch (rs->set.raid_type->algorithm) {
+ case left_asym: /* Left asymmetric. */
+ pi = data_devs - pi;
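+			/* Fall through to the common data index adjustment. */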
+ case right_asym: /* Right asymmetric. */
+ if (di >= pi)
+ di++;
+ break;
+
+ case left_sym: /* Left symmetric. */
+ pi = data_devs - pi;
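+			/* Fall through. */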
+ case right_sym: /* Right symmetric. */
+ di = (pi + di + 1) % raid_devs;
+ break;
+
+ default:
+ DMERR("Unknown RAID algorithm %d",
+ rs->set.raid_type->algorithm);
+ goto out;
+ }
+
+ break;
+
+ case raid4:
+ pi = rs->set.pi;
+ if (di >= pi)
+ di++;
+ break;
+
+ default:
+ DMERR("Unknown RAID level %d", rs->set.raid_type->level);
+ goto out;
+ }
+
+ /*
+ * Hash key = start offset on any single device of the RAID set;
+ * adjusted in case io size differs from chunk size.
+ */
+ addr->key = (stripe << rs->set.chunk_shift) +
+ (sector & rs->set.io_shift_mask);
+ addr->di = di;
+ addr->pi = pi;
+
+out:
+ return addr;
+}
+
+/*
+ * Copy data across between stripe pages and bio vectors.
+ *
+ * Pay attention to data alignment in stripe and bio pages.
+ */
+static void
+bio_copy_page_list(int rw, struct stripe *stripe,
+ struct page_list *pl, struct bio *bio)
+{
+ unsigned i, page_offset;
+ void *page_addr;
+ struct raid_set *rs = RS(stripe->sc);
+ struct bio_vec *bv;
+
+ /* Get start page in page list for this sector. */
+ i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
+ pl = pl_elem(pl, i);
+
+ page_addr = page_address(pl->page);
+ page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
+
+ /* Walk all segments and copy data across between bio_vecs and pages. */
+ bio_for_each_segment(bv, bio, i) {
+ int len = bv->bv_len, size;
+ unsigned bio_offset = 0;
+ void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
+redo:
+ size = (page_offset + len > PAGE_SIZE) ?
+ PAGE_SIZE - page_offset : len;
+
+ if (rw == READ)
+ memcpy(bio_addr + bio_offset,
+ page_addr + page_offset, size);
+ else
+ memcpy(page_addr + page_offset,
+ bio_addr + bio_offset, size);
+
+ page_offset += size;
+ if (page_offset == PAGE_SIZE) {
+ /*
+ * We reached the end of the chunk page ->
+			 * need to refer to the next one to copy more data.
+ */
+ len -= size;
+ if (len) {
+ /* Get next page. */
+ pl = pl->next;
+ BUG_ON(!pl);
+ page_addr = page_address(pl->page);
+ page_offset = 0;
+ bio_offset += size;
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
+ goto redo;
+ }
+ }
+
+ __bio_kunmap_atomic(bio_addr, KM_USER0);
+ }
+}
+
+/*
+ * Xor optimization macros.
+ */
+/* Xor data pointer declaration and initialization macros. */
+#define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1]
+#define DECLARE_3 DECLARE_2, *d2 = data[2]
+#define DECLARE_4 DECLARE_3, *d3 = data[3]
+#define DECLARE_5 DECLARE_4, *d4 = data[4]
+#define DECLARE_6 DECLARE_5, *d5 = data[5]
+#define DECLARE_7 DECLARE_6, *d6 = data[6]
+#define DECLARE_8 DECLARE_7, *d7 = data[7]
+
+/* Xor unroll macros. */
+#define D2(n) d0[n] = d0[n] ^ d1[n]
+#define D3(n) D2(n) ^ d2[n]
+#define D4(n) D3(n) ^ d3[n]
+#define D5(n) D4(n) ^ d4[n]
+#define D6(n) D5(n) ^ d5[n]
+#define D7(n) D6(n) ^ d6[n]
+#define D8(n) D7(n) ^ d7[n]
+
+#define X_2(macro, offset) macro(offset); macro(offset + 1);
+#define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2);
+#define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4);
+#define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8);
+#define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16);
+#define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32);
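+/* E.g. X_8(D3, 0) expands to eight D3(n) statements for n = 0..7. */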
+
+/* Define a _xor_#chunks_#xors_per_run() function. */
+#define _XOR(chunks, xors_per_run) \
+static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
+{ \
+ unsigned end = XOR_SIZE / sizeof(data[0]), i; \
+ DECLARE_ ## chunks; \
+\
+ for (i = 0; i < end; i += xors_per_run) { \
+ X_ ## xors_per_run(D ## chunks, i); \
+ } \
+}
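+/*
+ * E.g. _XOR(3, 16) defines _xor3_16(), which xors data[1] and data[2]
+ * into data[0], unrolled 16 longs per loop iteration across XOR_SIZE bytes.
+ */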
+
+/* Define xor functions for 2 - 8 chunks. */
+#define MAKE_XOR_PER_RUN(xors_per_run) \
+ _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
+ _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
+ _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
+ _XOR(8, xors_per_run);
+
+MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */
+MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */
+MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */
+MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */
+
+#define MAKE_XOR(xors_per_run) \
+struct { \
+ void (*f)(unsigned long **); \
+} static xor_funcs ## xors_per_run[] = { \
+ { NULL }, \
+ { NULL }, \
+ { _xor2_ ## xors_per_run }, \
+ { _xor3_ ## xors_per_run }, \
+ { _xor4_ ## xors_per_run }, \
+ { _xor5_ ## xors_per_run }, \
+ { _xor6_ ## xors_per_run }, \
+ { _xor7_ ## xors_per_run }, \
+ { _xor8_ ## xors_per_run }, \
+}; \
+\
+static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
+{ \
+ /* Call respective function for amount of chunks. */ \
+ xor_funcs ## xors_per_run[n].f(data); \
+}
+
+/* Define xor_8() - xor_64 functions. */
+MAKE_XOR(8)
+MAKE_XOR(16)
+MAKE_XOR(32)
+MAKE_XOR(64)
+
+/* Maximum number of chunks, which can be xor'ed in one go. */
+#define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1)
+
+struct xor_func {
+ xor_function_t f;
+ const char *name;
+} static xor_funcs[] = {
+ {xor_8, "xor_8"},
+ {xor_16, "xor_16"},
+ {xor_32, "xor_32"},
+ {xor_64, "xor_64"},
+};
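+/* The function actually used is referenced via rs->xor.f (see xor() below). */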
+
+/*
+ * Calculate parity (xor of all data chunks).
+ *
+ * This indexes into the page list of the stripe.
+ *
+ * All chunks will be xored into the parity chunk
+ * in maximum groups of xor.chunks.
+ *
+ * FIXME: try mapping the pages on discontiguous memory.
+ */
+static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ unsigned max_chunks = rs->xor.chunks, n, p;
+ unsigned o = sector / SECTORS_PER_PAGE; /* Offset into the page_list. */
+ unsigned long **d = rs->data;
+ xor_function_t xor_f = rs->xor.f->f;
+
+ /* Address of parity page to xor into. */
+ d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
+
+ /* Preset pointers to data pages. */
+ for (n = 1, p = rs->set.raid_devs; p--; ) {
+ if (p != pi && PageIO(PAGE(stripe, p)))
+ d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
+
+		/* If max chunks -> xor. */
+ if (n == max_chunks) {
+ xor_f(n, d);
+ n = 1;
+ }
+ }
+
+ /* If chunks -> xor. */
+ if (n > 1)
+ xor_f(n, d);
+
+ /* Set parity page uptodate and clean. */
+ page_set(PAGE(stripe, pi), CLEAN);
+}
+
+/* Common xor loop through all stripe page lists. */
+static void common_xor(struct stripe *stripe, sector_t count,
+ unsigned off, unsigned p)
+{
+ unsigned sector;
+
+ for (sector = off; sector < count; sector += SECTORS_PER_XOR)
+ xor(stripe, p, sector);
+
+ atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
+}
+
+/*
+ * Calculate parity sectors on intact stripes.
+ *
+ * Need to calculate raid address for recover stripe, because its
+ * chunk size differs and is typically larger than the io chunk size.
+ */
+static void parity_xor(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ unsigned chunk_size = rs->set.chunk_size,
+ io_size = stripe->io.size,
+ xor_size = chunk_size > io_size ? io_size : chunk_size;
+ sector_t off;
+
+ /* This can be the recover stripe with a larger io size. */
+ for (off = 0; off < io_size; off += xor_size) {
+ unsigned pi;
+
+ /*
+ * Recover stripe likely is bigger than regular io
+ * ones and has no precalculated parity disk index ->
+ * need to calculate RAID address.
+ */
+ if (unlikely(StripeRecover(stripe))) {
+ struct address addr;
+
+ raid_address(rs,
+ (stripe->key + off) * rs->set.data_devs,
+ &addr);
+ pi = addr.pi;
+ stripe_zero_pl_part(stripe, pi, off,
+ rs->set.chunk_size);
+ } else
+ pi = stripe->idx.parity;
+
+ common_xor(stripe, xor_size, off, pi);
+ page_set(PAGE(stripe, pi), DIRTY);
+ }
+}
+
+/* Reconstruct missing chunk. */
+static void reconstruct_xor(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ int p = stripe->idx.recover;
+
+ BUG_ON(p < 0);
+
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + (raid_set_degraded(rs) ?
+ S_RECONSTRUCT_EI : S_RECONSTRUCT_DEV));
+
+ /* Zero chunk to be reconstructed. */
+ stripe_zero_chunk(stripe, p);
+ common_xor(stripe, stripe->io.size, 0, p);
+}
+
+/*
+ * Try getting a stripe either from the hash or from the lru list
+ */
+static inline void _stripe_get(struct stripe *stripe)
+{
+ atomic_inc(&stripe->cnt);
+}
+
+static struct stripe *stripe_get(struct raid_set *rs, struct address *addr)
+{
+ struct stripe_cache *sc = &rs->sc;
+ struct stripe *stripe;
+
+ stripe = stripe_lookup(sc, addr->key);
+ if (stripe) {
+ _stripe_get(stripe);
+ /* Remove from the lru list if on. */
+ stripe_lru_del(stripe, LIST_LOCKED);
+ atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
+ } else {
+ /* Second try to get an LRU stripe. */
+ stripe = stripe_lru_pop(sc);
+ if (stripe) {
+ _stripe_get(stripe);
+ /* Invalidate before reinserting with changed key. */
+ stripe_invalidate(stripe);
+ stripe->key = addr->key;
+ stripe->region = dm_rh_sector_to_region(rs->recover.rh,
+ addr->key);
+ stripe->idx.parity = addr->pi;
+ sc_insert(sc, stripe);
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_INSCACHE);
+ }
+ }
+
+ return stripe;
+}
+
+/*
+ * Decrement reference count on a stripe.
+ *
+ * Move it to list of LRU stripes if zero.
+ */
+static void stripe_put(struct stripe *stripe)
+{
+ if (atomic_dec_and_test(&stripe->cnt)) {
+ if (TestClearStripeActive(stripe))
+ atomic_dec(&stripe->sc->active_stripes);
+
+ /* Put stripe onto the LRU list. */
+ stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
+ }
+
+ BUG_ON(atomic_read(&stripe->cnt) < 0);
+}
+
+/*
+ * Process end io
+ *
+ * I need to do it here, because I can't copy the bio pages in interrupt context.
+ *
+ * Read and write functions are split in order to avoid
+ * conditionals in the main loop for performance reasons.
+ */
+
+/* Helper read bios on a page list. */
+static void _bio_copy_page_list(struct stripe *stripe, struct page_list *pl,
+ struct bio *bio)
+{
+ bio_copy_page_list(READ, stripe, pl, bio);
+}
+
+/* Helper write bios on a page list. */
+static void _rh_dec(struct stripe *stripe, struct page_list *pl,
+ struct bio *bio)
+{
+ dm_rh_dec(RS(stripe->sc)->recover.rh, stripe->region);
+}
+
+/* End io all bios on a page list. */
+static inline int
+page_list_endio(int rw, struct stripe *stripe, unsigned p, unsigned *count)
+{
+ int r = 0;
+ struct bio_list *bl = BL(stripe, p, rw);
+
+ if (!bio_list_empty(bl)) {
+ struct page_list *pl = PL(stripe, p);
+ struct page *page = pl->page;
+
+ if (PageLocked(page))
+ r = -EBUSY;
+ /*
+ * FIXME: PageUptodate() not cleared
+ * properly for missing chunks ?
+ */
+ else if (PageUptodate(page)) {
+ struct bio *bio;
+ struct raid_set *rs = RS(stripe->sc);
+ void (*h_f)(struct stripe *, struct page_list *,
+ struct bio *) =
+ (rw == READ) ? _bio_copy_page_list : _rh_dec;
+
+ while ((bio = bio_list_pop(bl))) {
+ h_f(stripe, pl, bio);
+ _bio_endio(rs, bio, 0);
+ stripe_put(stripe);
+ if (count)
+ (*count)++;
+ }
+ } else
+ r = -EAGAIN;
+ }
+
+ return r;
+}
+
+/*
+ * End io all reads/writes on a stripe copying
+ * read data across from stripe to bios.
+ */
+static int stripe_endio(int rw, struct stripe *stripe, unsigned *count)
+{
+ int r = 0;
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ while (p--) {
+ int rr = page_list_endio(rw, stripe, p, count);
+
+ if (rr && r != -EIO)
+ r = rr;
+ }
+
+ return r;
+}
+
+/* Fail all ios on a bio list and return # of bios. */
+static unsigned
+bio_list_fail(struct raid_set *rs, struct stripe *stripe, struct bio_list *bl)
+{
+ unsigned r;
+ struct bio *bio;
+
+ raid_set_dead(rs);
+
+ /* Update region counters. */
+ if (stripe) {
+ struct dm_region_hash *rh = rs->recover.rh;
+
+ bio_list_for_each(bio, bl) {
+ if (bio_data_dir(bio) == WRITE)
+ dm_rh_dec(rh, stripe->region);
+ }
+ }
+
+ /* Error end io all bios. */
+ for (r = 0; (bio = bio_list_pop(bl)); r++)
+ _bio_endio(rs, bio, -EIO);
+
+ return r;
+}
+
+/* Fail all ios of a bio list of a stripe and drop io pending count. */
+static void
+stripe_bio_list_fail(struct raid_set *rs, struct stripe *stripe,
+ struct bio_list *bl)
+{
+ unsigned put = bio_list_fail(rs, stripe, bl);
+
+ while (put--)
+ stripe_put(stripe);
+}
+
+/* Fail all ios hanging off all bio lists of a stripe. */
+static void stripe_fail_io(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ unsigned p = rs->set.raid_devs;
+
+ stripe_evict(stripe);
+
+ while (p--) {
+ struct stripe_set *ss = stripe->ss + p;
+ int i = ARRAY_SIZE(ss->bl);
+
+ while (i--)
+ stripe_bio_list_fail(rs, stripe, ss->bl + i);
+ }
+}
+
+/*
+ * Handle all stripes by handing them to the daemon, because we can't
+ * map their pages to copy the data in interrupt context.
+ *
+ * We don't want to handle them here either, while interrupts are disabled.
+ */
+
+/* Read/write endio function for dm-io (interrupt context). */
+static void endio(unsigned long error, void *context)
+{
+ struct dm_mem_cache_object *obj = context;
+ struct stripe_set *ss = obj->private;
+ struct stripe *stripe = ss->stripe;
+ struct page *page = obj->pl->page;
+
+ if (unlikely(error))
+ stripe_error(stripe, page);
+ else
+ page_set(page, CLEAN);
+
+ __clear_page_locked(page);
+ stripe_io_dec(stripe);
+
+ /* Add stripe to endio list and wake daemon. */
+ stripe_endio_push(stripe);
+}
+
+/*
+ * Recovery io throttling
+ */
+/* Conditionally reset io counters. */
+enum count_type { IO_WORK = 0, IO_RECOVER };
+static int recover_io_reset(struct raid_set *rs)
+{
+ unsigned long j = jiffies;
+
+ /* Pay attention to jiffies overflows. */
+ if (j > rs->recover.last_jiffies + HZ
+ || j < rs->recover.last_jiffies) {
+ rs->recover.last_jiffies = j;
+ atomic_set(rs->recover.io_count + IO_WORK, 0);
+ atomic_set(rs->recover.io_count + IO_RECOVER, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Count ios. */
+static INLINE void
+recover_io_count(struct raid_set *rs, struct stripe *stripe)
+{
+ if (RSRecover(rs)) {
+ recover_io_reset(rs);
+ atomic_inc(rs->recover.io_count +
+ (StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
+ }
+}
+
+/* Read/Write a page_list asynchronously. */
+static void page_list_rw(struct stripe *stripe, unsigned p)
+{
+ struct stripe_cache *sc = stripe->sc;
+ struct raid_set *rs = RS(sc);
+ struct dm_mem_cache_object *obj = stripe->obj + p;
+ struct page_list *pl = obj->pl;
+ struct page *page = pl->page;
+ struct raid_dev *dev = rs->dev + p;
+ struct dm_io_region io = {
+ .bdev = dev->dev->bdev,
+ .sector = stripe->key,
+ .count = stripe->io.size,
+ };
+ struct dm_io_request control = {
+ .bi_rw = PageDirty(page) ? WRITE : READ,
+ .mem.type = DM_IO_PAGE_LIST,
+ .mem.ptr.pl = pl,
+ .mem.offset = 0,
+ .notify.fn = endio,
+ .notify.context = obj,
+ .client = sc->dm_io_client,
+ };
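+	/* The io is asynchronous; endio() above is called on completion. */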
+
+ BUG_ON(PageLocked(page));
+
+ /*
+ * Don't rw past end of device, which can happen, because
+	 * typically sectors_per_dev isn't divisible by io_size.
+ */
+ if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
+ io.count = rs->set.sectors_per_dev - io.sector;
+
+ io.sector += dev->start; /* Add <offset>. */
+ recover_io_count(rs, stripe); /* Recovery io accounting. */
+
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats +
+ (PageDirty(page) ? S_DM_IO_WRITE : S_DM_IO_READ));
+
+ ClearPageError(page);
+ __set_page_locked(page);
+ io_dev_queued(dev);
+ BUG_ON(dm_io(&control, 1, &io, NULL));
+}
+
+/*
+ * Write dirty / read not uptodate page lists of a stripe.
+ */
+static unsigned stripe_page_lists_rw(struct raid_set *rs, struct stripe *stripe)
+{
+ unsigned r;
+
+ /*
+ * Increment the pending count on the stripe
+ * first, so that we don't race in endio().
+ *
+ * An inc (IO) is needed for any page:
+ *
+ * o not uptodate
+ * o dirtied by writes merged
+ * o dirtied by parity calculations
+ */
+ r = for_each_io_dev(rs, stripe, _stripe_io_inc);
+ if (r) {
+ /* io needed: chunks are not uptodate/dirty. */
+ int max; /* REMOVEME: */
+ struct stripe_cache *sc = &rs->sc;
+
+ if (!TestSetStripeActive(stripe))
+ atomic_inc(&sc->active_stripes);
+
+ /* Take off the lru list in case it got added there. */
+ stripe_lru_del(stripe, LIST_LOCKED);
+
+ /* Submit actual io. */
+ for_each_io_dev(rs, stripe, page_list_rw);
+
+ /* REMOVEME: statistics */
+ max = sc_active(sc);
+ if (atomic_read(&sc->max_active_stripes) < max)
+ atomic_set(&sc->max_active_stripes, max);
+
+ atomic_inc(rs->stats + S_FLUSHS);
+ /* END REMOVEME: statistics */
+ }
+
+ return r;
+}
+
+/* Work in all pending writes. */
+static INLINE void _writes_merge(struct stripe *stripe, unsigned p)
+{
+ struct bio_list *write = BL(stripe, p, WRITE);
+
+ if (!bio_list_empty(write)) {
+ struct page_list *pl = stripe->obj[p].pl;
+ struct bio *bio;
+ struct bio_list *write_merged = BL(stripe, p, WRITE_MERGED);
+
+ /*
+ * We can play with the lists without holding a lock,
+ * because it is just us accessing them anyway.
+ */
+ bio_list_for_each(bio, write)
+ bio_copy_page_list(WRITE, stripe, pl, bio);
+
+ bio_list_merge(write_merged, write);
+ bio_list_init(write);
+ page_set(pl->page, DIRTY);
+ }
+}
+
+/* Merge in all writes hence dirtying respective pages. */
+static INLINE void writes_merge(struct stripe *stripe)
+{
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ while (p--)
+ _writes_merge(stripe, p);
+}
+
+/* Check, if a chunk gets completely overwritten. */
+static INLINE int stripe_check_overwrite(struct stripe *stripe, unsigned p)
+{
+ unsigned sectors = 0;
+ struct bio *bio;
+ struct bio_list *bl = BL(stripe, p, WRITE);
+
+ bio_list_for_each(bio, bl)
+ sectors += bio_sectors(bio);
+
+ return sectors == RS(stripe->sc)->set.io_size;
+}
+
+/*
+ * Prepare stripe to avoid io on broken/reconstructed
+ * drive in order to reconstruct data on endio.
+ */
+enum prepare_type { IO_ALLOW, IO_PROHIBIT };
+static void stripe_prepare(struct stripe *stripe, unsigned p,
+ enum prepare_type type)
+{
+ struct page *page = PAGE(stripe, p);
+
+ switch (type) {
+ case IO_PROHIBIT:
+ /*
+		 * In case we prohibit io, we have to make sure that
+		 * io on all chunks other than the one which failed
+		 * or is being reconstructed is allowed and that the
+		 * latter doesn't have state uptodate.
+ */
+ stripe_allow_io(stripe);
+ ClearPageUptodate(page);
+ ProhibitPageIO(page);
+
+ /* REMOVEME: statistics. */
+ atomic_inc(RS(stripe->sc)->stats + S_PROHIBITPAGEIO);
+ stripe->idx.recover = p;
+ SetStripeReconstruct(stripe);
+ break;
+
+ case IO_ALLOW:
+ AllowPageIO(page);
+ stripe->idx.recover = -1;
+ ClearStripeReconstruct(stripe);
+ break;
+
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Degraded/reconstruction mode.
+ *
+ * Check stripe state to figure which chunks don't need IO.
+ */
+static INLINE void stripe_check_reconstruct(struct stripe *stripe,
+ int prohibited)
+{
+ struct raid_set *rs = RS(stripe->sc);
+
+ /*
+ * Degraded mode (device(s) failed) ->
+ * avoid io on the failed device.
+ */
+ if (unlikely(raid_set_degraded(rs))) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_DEGRADED);
+ stripe_prepare(stripe, rs->set.ei, IO_PROHIBIT);
+ return;
+ } else {
+ /*
+ * Reconstruction mode (ie. a particular device or
+ * some (rotating) parity chunk is being resynchronized) ->
+ * o make sure all needed pages are read in
+ * o writes are allowed to go through
+ */
+ int r = region_state(rs, stripe->key, DM_RH_NOSYNC);
+
+ if (r) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_NOSYNC);
+ stripe_prepare(stripe, dev_for_parity(stripe),
+ IO_PROHIBIT);
+ return;
+ }
+ }
+
+ /*
+ * All disks good. Avoid reading parity chunk and reconstruct it
+ * unless we have prohibited io to chunk(s).
+ */
+ if (!prohibited) {
+ if (StripeMerged(stripe))
+ stripe_prepare(stripe, stripe->idx.parity, IO_ALLOW);
+ else {
+ stripe_prepare(stripe, stripe->idx.parity, IO_PROHIBIT);
+
+ /*
+ * Overrule stripe_prepare to reconstruct the
+ * parity chunk, because it'll be created new anyway.
+ */
+ ClearStripeReconstruct(stripe);
+ }
+ }
+}
+
+/* Check, if stripe is ready to merge writes. */
+static INLINE int stripe_check_merge(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ int prohibited = 0;
+ unsigned chunks = 0, p = rs->set.raid_devs;
+
+ /* Walk all chunks. */
+ while (p--) {
+ struct page *page = PAGE(stripe, p);
+
+ /* Can't merge active chunks. */
+ if (PageLocked(page)) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_MERGE_PAGE_LOCKED);
+ break;
+ }
+
+ /* Can merge uptodate chunks and have to count parity chunk. */
+ if (PageUptodate(page) || p == stripe->idx.parity) {
+ chunks++;
+ continue;
+ }
+
+ /* Read before write ordering. */
+ if (RSCheckOverwrite(rs) &&
+ bio_list_empty(BL(stripe, p, READ))) {
+ int r = stripe_check_overwrite(stripe, p);
+
+ if (r) {
+ chunks++;
+ /* REMOVEME: statistics. */
+ atomic_inc(RS(stripe->sc)->stats +
+ S_PROHIBITPAGEIO);
+ ProhibitPageIO(page);
+ prohibited = 1;
+ }
+ }
+ }
+
+ if (chunks == rs->set.raid_devs) {
+ /* All pages are uptodate or get written over or mixture. */
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_CAN_MERGE);
+ return 0;
+ } else
+ /* REMOVEME: statistics.*/
+ atomic_inc(rs->stats + S_CANT_MERGE);
+
+ return prohibited ? 1 : -EPERM;
+}
+
+/* Check chunks and prohibit io on those without queued reads. */
+static INLINE int stripe_check_read(struct stripe *stripe)
+{
+ int r = 0;
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ /* Walk all chunks. */
+ while (p--) {
+ struct page *page = PAGE(stripe, p);
+
+ if (!PageLocked(page) &&
+ bio_list_empty(BL(stripe, p, READ))) {
+ ProhibitPageIO(page);
+ r = 1;
+ }
+ }
+
+ return r;
+}
+
+/*
+ * Read/write a stripe.
+ *
+ * All stripe read/write activity goes through this function.
+ *
+ * States to cover:
+ * o stripe to read and/or write
+ * o stripe with error to reconstruct
+ */
+static int stripe_rw(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ int prohibited = 0, r;
+
+ /*
+ * Check the state of the RAID set and if degraded (or
+ * resynchronizing for reads), read in all other chunks but
+ * the one on the dead/resynchronizing device in order to be
+ * able to reconstruct the missing one.
+ *
+ * Merge all writes hanging off uptodate pages of the stripe.
+ */
+
+ /* Initially allow io on all chunks and prohibit below, if necessary. */
+ stripe_allow_io(stripe);
+
+ if (StripeRBW(stripe)) {
+ r = stripe_check_merge(stripe);
+ if (!r) {
+ /*
+ * If I could rely on valid parity (which would only
+ * be sure in case of a full synchronization),
+ * I could xor a fraction of chunks out of
+ * parity and back in.
+ *
+ * For the time being, I got to redo parity...
+ */
+ /* parity_xor(stripe); */ /* Xor chunks out. */
+ stripe_zero_chunk(stripe, stripe->idx.parity);
+ writes_merge(stripe); /* Merge writes in. */
+ parity_xor(stripe); /* Update parity. */
+ ClearStripeRBW(stripe); /* Disable RBW. */
+ SetStripeMerged(stripe); /* Writes merged. */
+ }
+
+ if (r > 0)
+ prohibited = 1;
+ } else if (!raid_set_degraded(rs))
+ /* Only allow for read avoidance if not degraded. */
+ prohibited = stripe_check_read(stripe);
+
+ /*
+ * Check, if io needs to be allowed/prohibited on certain chunks
+ * because of a degraded set or reconstruction on a region.
+ */
+ stripe_check_reconstruct(stripe, prohibited);
+
+ /* Now submit any reads/writes. */
+ r = stripe_page_lists_rw(rs, stripe);
+ if (!r) {
+ /*
+ * No io submitted because of chunk io prohibited or
+ * locked pages -> push to end io list for processing.
+ */
+ atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
+ stripe_endio_push(stripe);
+ wake_do_raid(rs); /* Wake myself. */
+ }
+
+ return 0;
+}
+
+/* Flush stripe either via flush list or immediately. */
+enum flush_type { FLUSH_DELAY, FLUSH_NOW };
+static int stripe_flush(struct stripe *stripe, enum flush_type type)
+{
+ int r = 0;
+
+ stripe_lru_del(stripe, LIST_LOCKED);
+
+ /* Immediately flush. */
+ if (type == FLUSH_NOW) {
+ if (likely(raid_set_operational(RS(stripe->sc))))
+ r = stripe_rw(stripe); /* Read/write stripe. */
+ else
+ /* Optimization: Fail early on failed sets. */
+ stripe_fail_io(stripe);
+ /* Delay flush by putting it on io list for later processing. */
+ } else if (type == FLUSH_DELAY)
+ stripe_io_add(stripe, POS_TAIL, LIST_UNLOCKED);
+ else
+ BUG();
+
+ return r;
+}
+
+/*
+ * Queue reads and writes to a stripe by hanging
+ * their bios off the stripe set's read/write lists.
+ *
+ * Endio reads on uptodate chunks.
+ */
+static INLINE int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
+ struct bio_list *reject)
+{
+ int r = 0;
+ struct address addr;
+ struct stripe *stripe =
+ stripe_get(rs, raid_address(rs, bio->bi_sector, &addr));
+
+ if (stripe) {
+ int rr, rw = bio_data_dir(bio);
+
+ rr = stripe_lock(rs, stripe, rw, addr.key); /* Lock stripe */
+ if (rr) {
+ stripe_put(stripe);
+ goto out;
+ }
+
+ /* Distinguish read and write cases. */
+ bio_list_add(BL(stripe, addr.di, rw), bio);
+
+ /* REMOVEME: statistics */
+ atomic_inc(rs->stats + (rw == WRITE ?
+ S_BIOS_ADDED_WRITE : S_BIOS_ADDED_READ));
+
+ if (rw == READ)
+ SetStripeRead(stripe);
+ else {
+ SetStripeRBW(stripe);
+
+ /* Increment pending write count on region. */
+ dm_rh_inc(rs->recover.rh, stripe->region);
+ r = 1; /* Region hash needs a flush. */
+ }
+
+ /*
+ * Optimize stripe flushing:
+ *
+ * o directly start io for read stripes.
+ *
+ * o put stripe onto stripe caches io_list for RBW,
+ * so that do_flush() can belabour it after we put
+ * more bios to the stripe for overwrite optimization.
+ */
+ stripe_flush(stripe,
+ StripeRead(stripe) ? FLUSH_NOW : FLUSH_DELAY);
+
+ /* Got no stripe from cache -> reject bio. */
+ } else {
+out:
+ bio_list_add(reject, bio);
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_IOS_POST);
+ }
+
+ return r;
+}
+
+/*
+ * Recovery functions
+ */
+/* Read a stripe off a raid set for recovery. */
+static int recover_read(struct raid_set *rs, struct stripe *stripe, int idx)
+{
+ /* Invalidate all pages so that they get read in. */
+ stripe_pages_invalidate(stripe);
+
+ /* Allow io on all recovery chunks. */
+ stripe_allow_io(stripe);
+
+ if (idx > -1)
+ ProhibitPageIO(PAGE(stripe, idx));
+
+ stripe->key = rs->recover.pos;
+ return stripe_page_lists_rw(rs, stripe);
+}
+
+/* Write a stripe to a raid set for recovery. */
+static int recover_write(struct raid_set *rs, struct stripe *stripe, int idx)
+{
+ /*
+ * If this is a reconstruct of a particular device, then
+ * reconstruct the respective page(s), else create parity page(s).
+ */
+ if (idx > -1) {
+ struct page *page = PAGE(stripe, idx);
+
+ AllowPageIO(page);
+ stripe_zero_chunk(stripe, idx);
+ common_xor(stripe, stripe->io.size, 0, idx);
+ page_set(page, DIRTY);
+ } else
+ parity_xor(stripe);
+
+ return stripe_page_lists_rw(rs, stripe);
+}
+
+/* Is recovery bandwidth available? */
+static int recover_bandwidth(struct raid_set *rs)
+{
+ int r, work;
+
+ /* On reset -> allow recovery. */
+ r = recover_io_reset(rs);
+ if (r || RSBandwidth(rs))
+ goto out;
+
+ work = atomic_read(rs->recover.io_count + IO_WORK);
+ if (work) {
+ /* Pay attention to larger recover stripe size. */
+ int recover =
+ atomic_read(rs->recover.io_count + IO_RECOVER) *
+ rs->recover.io_size /
+ rs->set.io_size;
+
+ /*
+ * Don't use more than given bandwidth of
+ * the work io for recovery.
+ */
+ if (recover > work / rs->recover.bandwidth_work) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_NO_BANDWIDTH);
+ return 0;
+ }
+ }
+
+out:
+ atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
+ return 1;
+}
+
+/* Try to get a region to recover. */
+static int recover_get_region(struct raid_set *rs)
+{
+ struct recover *rec = &rs->recover;
+ struct dm_region_hash *rh = rec->rh;
+
+ /* Start quiescing some regions. */
+ if (!RSRegionGet(rs)) {
+ int r = recover_bandwidth(rs); /* Enough bandwidth ?. */
+
+ if (r) {
+ r = dm_rh_recovery_prepare(rh);
+ if (r < 0) {
+ DMINFO("No %sregions to recover",
+ rec->nr_regions_to_recover ?
+ "more " : "");
+ return -ENOENT;
+ }
+ } else
+ return -EAGAIN;
+
+ SetRSRegionGet(rs);
+ }
+
+ if (!rec->reg) {
+ rec->reg = dm_rh_recovery_start(rh);
+ if (rec->reg) {
+ /*
+ * A reference for the region I'll
+ * keep till I've completely synced it.
+ */
+ io_get(rs);
+ rec->pos = dm_rh_region_to_sector(rh,
+ dm_rh_get_region_key(rec->reg));
+ rec->end = rec->pos + dm_rh_get_region_size(rh);
+ return 1;
+ } else
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+/* Read/write a recovery stripe. */
+static INLINE int recover_stripe_rw(struct raid_set *rs, struct stripe *stripe)
+{
+ /* Read/write flip-flop. */
+ if (TestClearStripeRBW(stripe)) {
+ SetStripeRead(stripe);
+ return recover_read(rs, stripe, idx_get(rs));
+ } else if (TestClearStripeRead(stripe))
+ return recover_write(rs, stripe, idx_get(rs));
+
+ return 0;
+}
+
+/* Reset recovery variables. */
+static void recovery_region_reset(struct raid_set *rs)
+{
+ rs->recover.reg = NULL;
+ ClearRSRegionGet(rs);
+}
+
+/* Update region hash state. */
+static void recover_rh_update(struct raid_set *rs, int error)
+{
+ struct recover *rec = &rs->recover;
+ struct dm_region *reg = rec->reg;
+
+ if (reg) {
+ dm_rh_recovery_end(reg, error);
+ if (!error)
+ rec->nr_regions_recovered++;
+
+ recovery_region_reset(rs);
+ }
+
+ /* Use rec->rh: reg may already be NULL at this point. */
+ dm_rh_update_states(rec->rh, 1);
+ dm_rh_flush(rec->rh);
+ io_put(rs); /* Release the io reference for the region. */
+}
+
+/* Called by main io daemon to recover regions. */
+/* FIXME: cope with MAX_RECOVER > 1. */
+static INLINE void _do_recovery(struct raid_set *rs, struct stripe *stripe)
+{
+ int r;
+ struct recover *rec = &rs->recover;
+
+ /* If recovery is active -> return. */
+ if (StripeActive(stripe))
+ return;
+
+ /* io error is fatal for recovery -> stop it. */
+ if (unlikely(StripeError(stripe)))
+ goto err;
+
+ /* Get a region to recover. */
+ r = recover_get_region(rs);
+ switch (r) {
+ case 1: /* Got a new region. */
+ /* Flag read before write. */
+ ClearStripeRead(stripe);
+ SetStripeRBW(stripe);
+ break;
+
+ case 0:
+ /* Got a region in the works. */
+ r = recover_bandwidth(rs);
+ if (r) /* Got enough bandwidth. */
+ break;
+
+ case -EAGAIN:
+ /* No bandwidth/quiesced region yet, try later. */
+ wake_do_raid_delayed(rs, HZ / 10);
+ return;
+
+ case -ENOENT: /* No more regions. */
+ dm_table_event(rs->ti->table);
+ goto free;
+ }
+
+ /* Read/write a recover stripe. */
+ r = recover_stripe_rw(rs, stripe);
+ if (r) {
+ /* IO initiated, get another reference for the IO. */
+ io_get(rs);
+ return;
+ }
+
+ /* Update recovery position within region. */
+ rec->pos += stripe->io.size;
+
+ /* If we're at end of region, update region hash. */
+ if (rec->pos >= rec->end ||
+ rec->pos >= rs->set.sectors_per_dev)
+ recover_rh_update(rs, 0);
+ else
+ SetStripeRBW(stripe);
+
+ /* Schedule myself for another round... */
+ wake_do_raid(rs);
+ return;
+
+err:
+ raid_set_check_degrade(rs, stripe);
+
+ {
+ char buf[BDEVNAME_SIZE];
+
+ DMERR("stopping recovery due to "
+ "ERROR on /dev/%s, stripe at offset %llu",
+ bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
+ (unsigned long long) stripe->key);
+
+ }
+
+ /* Make sure, that all quiesced regions get released. */
+ do {
+ if (rec->reg)
+ dm_rh_recovery_end(rec->reg, -EIO);
+
+ rec->reg = dm_rh_recovery_start(rec->rh);
+ } while (rec->reg);
+
+ recover_rh_update(rs, -EIO);
+free:
+ rs->set.dev_to_init = -1;
+
+ /* Check for jiffies overrun. */
+ rs->recover.end_jiffies = jiffies;
+ if (rs->recover.end_jiffies < rs->recover.start_jiffies)
+ rs->recover.end_jiffies = ~0;
+
+ ClearRSRecover(rs);
+}
+
+static INLINE void do_recovery(struct raid_set *rs)
+{
+ struct stripe *stripe;
+
+ list_for_each_entry(stripe, &rs->recover.stripes, lists[LIST_RECOVER])
+ _do_recovery(rs, stripe);
+
+ if (!RSRecover(rs))
+ stripe_recover_free(rs);
+}
+
+/*
+ * END recovery functions
+ */
+
+/* End io process all stripes handed in by endio() callback. */
+static void do_endios(struct raid_set *rs)
+{
+ struct stripe_cache *sc = &rs->sc;
+ struct stripe *stripe;
+
+ while ((stripe = stripe_endio_pop(sc))) {
+ unsigned count;
+
+ /* Recovery stripe special case. */
+ if (unlikely(StripeRecover(stripe))) {
+ if (stripe_io(stripe))
+ continue;
+
+ io_put(rs); /* Release region io reference. */
+ ClearStripeActive(stripe);
+
+ /* REMOVEME: statistics*/
+ atomic_dec(&sc->active_stripes);
+ continue;
+ }
+
+ /* Early end io all reads on any uptodate chunks. */
+ stripe_endio(READ, stripe, (count = 0, &count));
+ if (stripe_io(stripe)) {
+ if (count) /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_ACTIVE_READS);
+
+ continue;
+ }
+
+ /* Set stripe inactive after all io got processed. */
+ if (TestClearStripeActive(stripe))
+ atomic_dec(&sc->active_stripes);
+
+ /* Unlock stripe (for clustering). */
+ stripe_unlock(rs, stripe);
+
+ /*
+	 * If an io error on a stripe occurred and the RAID set
+ * is still operational, requeue the stripe for io.
+ */
+ if (TestClearStripeError(stripe)) {
+ raid_set_check_degrade(rs, stripe);
+ ClearStripeReconstruct(stripe);
+
+ if (!StripeMerged(stripe) &&
+ raid_set_operational(rs)) {
+ stripe_pages_invalidate(stripe);
+ stripe_flush(stripe, FLUSH_DELAY);
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_REQUEUE);
+ continue;
+ }
+ }
+
+ /* Check if the RAID set is inoperational to error ios. */
+ if (!raid_set_operational(rs)) {
+ ClearStripeReconstruct(stripe);
+ stripe_fail_io(stripe);
+ BUG_ON(atomic_read(&stripe->cnt));
+ continue;
+ }
+
+ /* Got to reconstruct a missing chunk. */
+ if (TestClearStripeReconstruct(stripe))
+ reconstruct_xor(stripe);
+
+ /*
+ * Now that we've got a complete stripe, we can
+ * process the rest of the end ios on reads.
+ */
+ BUG_ON(stripe_endio(READ, stripe, NULL));
+ ClearStripeRead(stripe);
+
+ /*
+ * Read-before-write stripes need to be flushed again in
+ * order to work the write data into the pages *after*
+ * they were read in.
+ */
+ if (TestClearStripeMerged(stripe))
+ /* End io all bios which got merged already. */
+ BUG_ON(stripe_endio(WRITE_MERGED, stripe, NULL));
+
+ /* Got to put on flush list because of new writes. */
+ if (StripeRBW(stripe))
+ stripe_flush(stripe, FLUSH_DELAY);
+ }
+}
+
+/*
+ * Stripe cache shrinking.
+ */
+static INLINE void do_sc_shrink(struct raid_set *rs)
+{
+ unsigned shrink = atomic_read(&rs->sc.stripes_to_shrink);
+
+ if (shrink) {
+ unsigned cur = atomic_read(&rs->sc.stripes);
+
+ sc_shrink(&rs->sc, shrink);
+ shrink -= cur - atomic_read(&rs->sc.stripes);
+ atomic_set(&rs->sc.stripes_to_shrink, shrink);
+
+ /*
+ * Wake myself up in case we failed to shrink the
+ * requested amount in order to try again later.
+ */
+ if (shrink)
+ wake_do_raid(rs);
+ }
+}
+
+
+/*
+ * Process all ios
+ *
+ * We do different things with the io depending on the
+ * state of the region that it's in:
+ *
+ * o reads: hang off stripe cache or postpone if full
+ *
+ * o writes:
+ *
+ * CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
+ * In case stripe cache is full or busy, postpone the io.
+ *
+ * RECOVERING: delay the io until recovery of the region completes.
+ *
+ */
+static INLINE void do_ios(struct raid_set *rs, struct bio_list *ios)
+{
+ int r;
+ unsigned flush = 0;
+ struct dm_region_hash *rh = rs->recover.rh;
+ struct bio *bio;
+ struct bio_list delay, reject;
+
+ bio_list_init(&delay);
+ bio_list_init(&reject);
+
+ /*
+ * Classify each io:
+ * o delay to recovering regions
+ * o queue to all other regions
+ */
+ while ((bio = bio_list_pop(ios))) {
+ /*
+ * In case we get a barrier bio, push it back onto
+ * the input queue unless all work queues are empty
+ * and the stripe cache is inactive.
+ */
+ if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_BARRIER);
+ if (!list_empty(rs->sc.lists + LIST_IO) ||
+ !bio_list_empty(&delay) ||
+ !bio_list_empty(&reject) ||
+ sc_active(&rs->sc)) {
+ bio_list_push(ios, bio);
+ break;
+ }
+ }
+
+ r = region_state(rs, _sector(rs, bio), DM_RH_RECOVERING);
+ if (unlikely(r)) {
+ /* Got to wait for recovering regions. */
+ bio_list_add(&delay, bio);
+ SetRSBandwidth(rs);
+ } else {
+ /*
+ * Process ios to non-recovering regions by queueing
+			 * them to stripes (does rh_inc() for writes).
+ */
+ flush += stripe_queue_bio(rs, bio, &reject);
+ }
+ }
+
+ if (flush) {
+ r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
+ if (r)
+ DMERR("dirty log flush");
+ }
+
+ /* Delay ios to regions which are recovering. */
+ while ((bio = bio_list_pop(&delay))) {
+ /* REMOVEME: statistics.*/
+ atomic_inc(rs->stats + S_DELAYED_BIOS);
+ atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
+ dm_rh_delay(rh, bio);
+
+ }
+
+ /* Merge any rejected bios back to the head of the input list. */
+ bio_list_merge_head(ios, &reject);
+}
+
+/* Flush any stripes on the io list. */
+static INLINE void do_flush(struct raid_set *rs)
+{
+ struct list_head *list = rs->sc.lists + LIST_IO, *pos, *tmp;
+
+ list_for_each_safe(pos, tmp, list) {
+ int r = stripe_flush(list_entry(pos, struct stripe,
+ lists[LIST_IO]), FLUSH_NOW);
+
+ /* Remove from the list only if the stripe got processed. */
+ if (!r)
+ list_del_init(pos);
+ }
+}
+
+/*
+ * Send an event in case we're getting too busy, i.e. when more than
+ * 4/5 (80%) of the stripe cache's stripes are active.
+ */
+static INLINE void do_busy_event(struct raid_set *rs)
+{
+ if ((sc_active(&rs->sc) > atomic_read(&rs->sc.stripes) * 4 / 5)) {
+ if (!TestSetRSScBusy(rs))
+ dm_table_event(rs->ti->table);
+ } else
+ ClearRSScBusy(rs);
+}
+
+/* Unplug: let the io roll on the set's devices. */
+static INLINE void do_unplug(struct raid_set *rs)
+{
+ struct raid_dev *dev = rs->dev + rs->set.raid_devs;
+
+ while (dev-- > rs->dev) {
+		/* Only call the device's unplug function if io got queued. */
+ if (io_dev_clear(dev))
+ blk_unplug(bdev_get_queue(dev->dev->bdev));
+ }
+}
+
+/*-----------------------------------------------------------------
+ * RAID daemon
+ *---------------------------------------------------------------*/
+/*
+ * o belabour all end ios
+ * o optionally shrink the stripe cache
+ * o update the region hash states
+ * o optionally do recovery
+ * o grab the input queue
+ * o work on all requeued or new ios and perform stripe cache flushes
+ * unless the RAID set is inoperational (when we error ios)
+ * o check, if the stripe cache gets too busy and throw an event if so
+ * o unplug any component raid devices with queued bios
+ */
+static void do_raid(struct work_struct *ws)
+{
+ struct raid_set *rs = container_of(ws, struct raid_set, io.dws.work);
+ struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
+ spinlock_t *lock = &rs->io.in_lock;
+
+ /*
+ * We always need to end io, so that ios
+ * can get errored in case the set failed
+ * and the region counters get decremented
+ * before we update the region hash states.
+ */
+redo:
+ do_endios(rs);
+
+ /*
+ * Now that we've end io'd, which may have put stripes on
+ * the LRU list, we shrink the stripe cache if requested.
+ */
+ do_sc_shrink(rs);
+
+ /* Update region hash states before we go any further. */
+ dm_rh_update_states(rs->recover.rh, 1);
+
+ /* Try to recover regions. */
+ if (RSRecover(rs))
+ do_recovery(rs);
+
+ /* More endios -> process. */
+ if (!stripe_endio_empty(&rs->sc)) {
+ atomic_inc(rs->stats + S_REDO);
+ goto redo;
+ }
+
+ /* Quickly grab all new ios queued and add them to the work list. */
+ spin_lock_irq(lock);
+ bio_list_merge(ios, ios_in);
+ bio_list_init(ios_in);
+ spin_unlock_irq(lock);
+
+ /* Let's assume we're operational most of the time ;-). */
+ if (likely(raid_set_operational(rs))) {
+ /* If we got ios, work them into the cache. */
+ if (!bio_list_empty(ios)) {
+ do_ios(rs, ios);
+			do_unplug(rs);	/* Unplug the set's device queues. */
+ }
+
+ do_flush(rs); /* Flush any stripes on io list. */
+		do_unplug(rs);		/* Unplug the set's device queues. */
+ do_busy_event(rs); /* Check if we got too busy. */
+
+ /* More endios -> process. */
+ if (!stripe_endio_empty(&rs->sc)) {
+ atomic_inc(rs->stats + S_REDO);
+ goto redo;
+ }
+ } else
+ /* No way to reconstruct data with too many devices failed. */
+ bio_list_fail(rs, NULL, ios);
+}
+
+/*
+ * Callback for region hash to dispatch
+ * delayed bios queued to recovered regions
+ * (Gets called via rh_update_states()).
+ */
+static void dispatch_delayed_bios(void *context, struct bio_list *bl)
+{
+ struct raid_set *rs = context;
+ struct bio *bio;
+
+ /* REMOVEME: decrement pending delayed bios counter. */
+ bio_list_for_each(bio, bl)
+ atomic_dec(rs->stats + S_DELAYED_BIOS);
+
+ /* Merge region hash private list to work list. */
+ bio_list_merge_head(&rs->io.work, bl);
+ bio_list_init(bl);
+ ClearRSBandwidth(rs);
+}
+
+/*************************************************************
+ * Constructor helpers
+ *************************************************************/
+/* Calculate MB/sec. */
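+/*
+ * Note: speed is the xor count per jiffy measured by xor_speed() on a
+ * recovery stripe; scaling by HZ, the per-device recovery io size (in
+ * sectors) and the number of data devices, then converting to bytes and
+ * shifting down twice by 10 yields MB/s.
+ */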
+static INLINE unsigned mbpers(struct raid_set *rs, unsigned speed)
+{
+ return to_bytes(speed * rs->set.data_devs *
+ rs->recover.io_size * HZ >> 10) >> 10;
+}
+
+/*
+ * Discover fastest xor algorithm and # of chunks combination.
+ */
+/* Calculate speed for algorithm and # of chunks. */
+static INLINE unsigned xor_speed(struct stripe *stripe)
+{
+ unsigned r = 0;
+ unsigned long j;
+
+ /* Wait for next tick. */
+ for (j = jiffies; j == jiffies;)
+ ;
+
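+	/*
+	 * The busy-wait above aligns to a tick boundary, so the loop below
+	 * counts complete xor runs over exactly one jiffy; the mb() barriers
+	 * keep reordering from skewing the count. The result is only a
+	 * relative speed used to pick the fastest xor function.
+	 */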
+ /* Do xors for a full tick. */
+ for (j = jiffies; j == jiffies;) {
+ mb();
+ common_xor(stripe, stripe->io.size, 0, 0);
+ mb();
+ r++;
+ mb();
+ }
+
+ return r;
+}
+
+/* Optimize xor algorithm for this RAID set. */
+static unsigned xor_optimize(struct raid_set *rs)
+{
+ unsigned chunks_max = 2, speed_max = 0;
+ struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
+ struct stripe *stripe;
+
+ BUG_ON(list_empty(&rs->recover.stripes));
+ stripe = list_first_entry(&rs->recover.stripes, struct stripe,
+ lists[LIST_RECOVER]);
+
+ /*
+ * Got to allow io on all chunks, so that
+ * xor() will actually work on them.
+ */
+ stripe_allow_io(stripe);
+
+ /* Try all xor functions. */
+ while (f-- > xor_funcs) {
+ unsigned speed;
+
+ /* Set actual xor function for common_xor(). */
+ rs->xor.f = f;
+ rs->xor.chunks = XOR_CHUNKS_MAX + 1;
+
+ while (rs->xor.chunks-- > 2) {
+ speed = xor_speed(stripe);
+ if (speed > speed_max) {
+ speed_max = speed;
+ chunks_max = rs->xor.chunks;
+ f_max = f;
+ }
+ }
+ }
+
+ /* Memorize optimum parameters. */
+ rs->xor.f = f_max;
+ rs->xor.chunks = chunks_max;
+ return speed_max;
+}
+
+static inline int array_too_big(unsigned long fixed, unsigned long obj,
+ unsigned long num)
+{
+ return (num > (ULONG_MAX - fixed) / obj);
+}
+
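+/*
+ * Required wakeup callback for dm_region_hash_create() below; intentionally
+ * a no-op, since this target does not use the recovery-waiter mechanism.
+ */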
+static void wakeup_all_recovery_waiters(void *context)
+{
+}
+
+/*
+ * Allocate a RAID context (a RAID set)
+ */
+static int
+context_alloc(struct raid_set **raid_set, struct raid_type *raid_type,
+ unsigned stripes, unsigned chunk_size, unsigned io_size,
+ unsigned recover_io_size, unsigned raid_devs,
+ sector_t sectors_per_dev,
+ struct dm_target *ti, unsigned dl_parms, char **argv)
+{
+ int r;
+ unsigned p;
+ size_t len;
+ sector_t region_size, ti_len;
+ struct raid_set *rs = NULL;
+ struct dm_dirty_log *dl;
+ struct recover *rec;
+
+ /*
+ * Create the dirty log
+ *
+ * We need to change length for the dirty log constructor,
+ * because we want an amount of regions for all stripes derived
+ * from the single device size, so that we can keep region
+	 * size = 2^^n independent of the number of devices
+ */
+ ti_len = ti->len;
+ ti->len = sectors_per_dev;
+ dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2);
+ ti->len = ti_len;
+ if (!dl)
+ goto bad_dirty_log;
+
+ /* Chunk size *must* be smaller than region size. */
+ region_size = dl->type->get_region_size(dl);
+ if (chunk_size > region_size)
+ goto bad_chunk_size;
+
+ /* Recover io size *must* be smaller than region size as well. */
+ if (recover_io_size > region_size)
+ goto bad_recover_io_size;
+
+ /* Size and allocate the RAID set structure. */
+ len = sizeof(*rs->data) + sizeof(*rs->dev);
+ if (array_too_big(sizeof(*rs), len, raid_devs))
+ goto bad_array;
+
+ len = sizeof(*rs) + raid_devs * len;
+ rs = kzalloc(len, GFP_KERNEL);
+ if (!rs)
+ goto bad_alloc;
+
+ rec = &rs->recover;
+ atomic_set(&rs->io.in_process, 0);
+ atomic_set(&rs->io.in_process_max, 0);
+ rec->io_size = recover_io_size;
+
+ /* Pointer to data array. */
+ rs->data = (unsigned long **)
+ ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
+ rec->dl = dl;
+ rs->set.raid_devs = p = raid_devs;
+ rs->set.data_devs = raid_devs - raid_type->parity_devs;
+ rs->set.raid_type = raid_type;
+
+ /*
+ * Set chunk and io size and respective shifts
+ * (used to avoid divisions)
+ */
+ rs->set.chunk_size = chunk_size;
+ rs->set.chunk_mask = chunk_size - 1;
+ rs->set.chunk_shift = ffs(chunk_size) - 1;
+
+ rs->set.io_size = io_size;
+ rs->set.io_mask = io_size - 1;
+ rs->set.io_shift = ffs(io_size) - 1;
+ rs->set.io_shift_mask = rs->set.chunk_mask & ~rs->set.io_mask;
+
+ rs->set.pages_per_io = chunk_pages(io_size);
+ rs->set.sectors_per_dev = sectors_per_dev;
+
+ rs->set.ei = -1; /* Indicate no failed device. */
+ atomic_set(&rs->set.failed_devs, 0);
+
+ rs->ti = ti;
+
+ atomic_set(rec->io_count + IO_WORK, 0);
+ atomic_set(rec->io_count + IO_RECOVER, 0);
+
+ /* Initialize io lock and queues. */
+ spin_lock_init(&rs->io.in_lock);
+ bio_list_init(&rs->io.in);
+ bio_list_init(&rs->io.work);
+
+ init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */
+
+ rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
+
+ rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios, wake_do_raid,
+ wakeup_all_recovery_waiters,
+ rs->ti->begin, MAX_RECOVER, dl,
+ region_size, rs->recover.nr_regions);
+ if (IS_ERR(rec->rh))
+ goto bad_rh;
+
+ /* Initialize stripe cache. */
+ r = sc_init(rs, stripes);
+ if (r)
+ goto bad_sc;
+
+ /* Create dm-io client context. */
+ rs->sc.dm_io_client = dm_io_client_create(rs->set.raid_devs *
+ rs->set.pages_per_io);
+ if (IS_ERR(rs->sc.dm_io_client))
+ goto bad_dm_io_client;
+
+ /* REMOVEME: statistics. */
+ stats_reset(rs);
+	ClearRSDevelStats(rs);	/* Disable development statistics. */
+
+ *raid_set = rs;
+ return 0;
+
+bad_dirty_log:
+ TI_ERR_RET("Error creating dirty log", -ENOMEM);
+
+
+bad_chunk_size:
+ dm_dirty_log_destroy(dl);
+ TI_ERR("Chunk size larger than region size");
+
+bad_recover_io_size:
+ dm_dirty_log_destroy(dl);
+ TI_ERR("Recover stripe io size larger than region size");
+
+bad_array:
+ dm_dirty_log_destroy(dl);
+	TI_ERR("Array too big");
+
+bad_alloc:
+ dm_dirty_log_destroy(dl);
+ TI_ERR_RET("Cannot allocate raid context", -ENOMEM);
+
+bad_rh:
+ dm_dirty_log_destroy(dl);
+ ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
+ goto free_rs;
+
+bad_sc:
+ ti->error = DM_MSG_PREFIX "Error creating stripe cache";
+ goto free;
+
+bad_dm_io_client:
+ ti->error = DM_MSG_PREFIX "Error allocating dm-io resources";
+free:
+ sc_exit(&rs->sc);
+ dm_region_hash_destroy(rec->rh); /* Destroys dirty log as well. */
+free_rs:
+ kfree(rs);
+ return -ENOMEM;
+}
+
+/* Free a RAID context (a RAID set). */
+static void
+context_free(struct raid_set *rs, struct dm_target *ti, unsigned r)
+{
+ while (r--)
+ dm_put_device(ti, rs->dev[r].dev);
+
+ dm_io_client_destroy(rs->sc.dm_io_client);
+ sc_exit(&rs->sc);
+ dm_region_hash_destroy(rs->recover.rh);
+ dm_dirty_log_destroy(rs->recover.dl);
+ kfree(rs);
+}
+
+/* Create work queue and initialize work. */
+static int rs_workqueue_init(struct raid_set *rs)
+{
+ struct dm_target *ti = rs->ti;
+
+ rs->io.wq = create_singlethread_workqueue(DAEMON);
+ if (!rs->io.wq)
+ TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
+
+ INIT_DELAYED_WORK(&rs->io.dws, do_raid);
+ return 0;
+}
+
+/* Return pointer to raid_type structure for raid name. */
+static struct raid_type *get_raid_type(char *name)
+{
+ struct raid_type *r = ARRAY_END(raid_types);
+
+ while (r-- > raid_types) {
+ if (!strnicmp(STR_LEN(r->name, name)))
+ return r;
+ }
+
+ return NULL;
+}
+
+/* FIXME: factor out to dm core. */
+static int multiple(sector_t a, sector_t b, sector_t *n)
+{
+ sector_t r = a;
+
+ sector_div(r, b);
+ *n = r;
+ return a == r * b;
+}
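+
+/*
+ * Example: multiple(1024, 256, &n) returns 1 with n = 4;
+ * multiple(1000, 256, &n) returns 0 with n = 3.
+ */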
+
+/* Log RAID set information to kernel log. */
+static void raid_set_log(struct raid_set *rs, unsigned speed)
+{
+ unsigned p;
+ char buf[BDEVNAME_SIZE];
+
+ for (p = 0; p < rs->set.raid_devs; p++)
+ DMINFO("/dev/%s is raid disk %u",
+ bdevname(rs->dev[p].dev->bdev, buf), p);
+
+ DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes",
+ rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
+ atomic_read(&rs->sc.stripes));
+ DMINFO("algorithm \"%s\", %u chunks with %uMB/s", rs->xor.f->name,
+ rs->xor.chunks, mbpers(rs, speed));
+ DMINFO("%s set with net %u/%u devices", rs->set.raid_type->descr,
+ rs->set.data_devs, rs->set.raid_devs);
+}
+
+/* Get all devices and offsets. */
+static int
+dev_parms(struct dm_target *ti, struct raid_set *rs,
+ char **argv, int *p)
+{
+ for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
+ int r;
+ unsigned long long tmp;
+ struct raid_dev *dev = rs->dev + *p;
+ union dev_lookup dl = {.dev = dev };
+
+ /* Get offset and device. */
+ r = sscanf(argv[1], "%llu", &tmp);
+ if (r != 1)
+ TI_ERR("Invalid RAID device offset parameter");
+
+ dev->start = tmp;
- r = dm_get_device(ti, argv[0], dev->start,
- rs->set.sectors_per_dev,
- dm_table_get_mode(ti->table), &dev->dev);
++ r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
++ &dev->dev);
+ if (r)
+ TI_ERR_RET("RAID device lookup failure", r);
+
+ r = raid_dev_lookup(rs, bynumber, &dl);
+ if (r != -ENODEV && r < *p) {
+ (*p)++; /* Ensure dm_put_device() on actual device. */
+ TI_ERR_RET("Duplicate RAID device", -ENXIO);
+ }
+ }
+
+ return 0;
+}
+
+/* Set recovery bandwidth. */
+static INLINE void
+recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
+{
+ rs->recover.bandwidth = bandwidth;
+ rs->recover.bandwidth_work = 100 / bandwidth;
+}
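+
+/*
+ * Example: bandwidth = 20 yields bandwidth_work = 100 / 20 = 5, so
+ * recover_bandwidth() only allows recovery io while it stays below
+ * work / 5, i.e. roughly 20% of the application io in the same interval.
+ */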
+
+/* Handle variable number of RAID parameters. */
+static int
+raid_variable_parms(struct dm_target *ti, char **argv,
+ unsigned i, int *raid_parms,
+ int *chunk_size, int *chunk_size_parm,
+ int *stripes, int *stripes_parm,
+ int *io_size, int *io_size_parm,
+ int *recover_io_size, int *recover_io_size_parm,
+ int *bandwidth, int *bandwidth_parm)
+{
+ /* Fetch # of variable raid parameters. */
+ if (sscanf(argv[i++], "%d", raid_parms) != 1 ||
+ !range_ok(*raid_parms, 0, 5))
+ TI_ERR("Bad variable raid parameters number");
+
+ if (*raid_parms) {
+ /*
+ * If we've got variable RAID parameters,
+ * chunk size is the first one
+ */
+ if (sscanf(argv[i++], "%d", chunk_size) != 1 ||
+ (*chunk_size != -1 &&
+ (!POWER_OF_2(*chunk_size) ||
+ !range_ok(*chunk_size, IO_SIZE_MIN, CHUNK_SIZE_MAX))))
+ TI_ERR("Invalid chunk size; must be 2^^n and <= 16384");
+
+ *chunk_size_parm = *chunk_size;
+ if (*chunk_size == -1)
+ *chunk_size = CHUNK_SIZE;
+
+ /*
+ * In case we've got 2 or more variable raid
+ * parameters, the number of stripes is the second one
+ */
+ if (*raid_parms > 1) {
+ if (sscanf(argv[i++], "%d", stripes) != 1 ||
+ (*stripes != -1 &&
+ !range_ok(*stripes, STRIPES_MIN,
+ STRIPES_MAX)))
+ TI_ERR("Invalid number of stripes: must "
+ "be >= 8 and <= 8192");
+ }
+
+ *stripes_parm = *stripes;
+ if (*stripes == -1)
+ *stripes = STRIPES;
+
+ /*
+ * In case we've got 3 or more variable raid
+ * parameters, the io size is the third one.
+ */
+ if (*raid_parms > 2) {
+ if (sscanf(argv[i++], "%d", io_size) != 1 ||
+ (*io_size != -1 &&
+ (!POWER_OF_2(*io_size) ||
+ !range_ok(*io_size, IO_SIZE_MIN,
+ min(BIO_MAX_SECTORS / 2,
+ *chunk_size)))))
+ TI_ERR("Invalid io size; must "
+				       "be 2^^n and less or equal "
+ "min(BIO_MAX_SECTORS/2, chunk size)");
+ } else
+ *io_size = *chunk_size;
+
+ *io_size_parm = *io_size;
+ if (*io_size == -1)
+ *io_size = *chunk_size;
+
+ /*
+ * In case we've got 4 variable raid parameters,
+ * the recovery stripe io_size is the fourth one
+ */
+ if (*raid_parms > 3) {
+ if (sscanf(argv[i++], "%d", recover_io_size) != 1 ||
+ (*recover_io_size != -1 &&
+ (!POWER_OF_2(*recover_io_size) ||
+ !range_ok(*recover_io_size, RECOVER_IO_SIZE_MIN,
+ BIO_MAX_SECTORS / 2))))
+ TI_ERR("Invalid recovery io size; must be "
+				       "2^^n and less or equal BIO_MAX_SECTORS/2");
+ }
+
+ *recover_io_size_parm = *recover_io_size;
+ if (*recover_io_size == -1)
+ *recover_io_size = RECOVER_IO_SIZE;
+
+ /*
+ * In case we've got 5 variable raid parameters,
+ * the recovery io bandwidth is the fifth one
+ */
+ if (*raid_parms > 4) {
+ if (sscanf(argv[i++], "%d", bandwidth) != 1 ||
+ (*bandwidth != -1 &&
+ !range_ok(*bandwidth, BANDWIDTH_MIN,
+ BANDWIDTH_MAX)))
+ TI_ERR("Invalid recovery bandwidth "
+ "percentage; must be > 0 and <= 100");
+ }
+
+ *bandwidth_parm = *bandwidth;
+ if (*bandwidth == -1)
+ *bandwidth = BANDWIDTH;
+ }
+
+ return 0;
+}
+
+/* Parse optional locking parameters. */
+static int
+raid_locking_parms(struct dm_target *ti, char **argv,
+ unsigned i, int *locking_parms,
+ struct dm_raid45_locking_type **locking_type)
+{
+ *locking_parms = 0;
+ *locking_type = &locking_none;
+
+ if (!strnicmp(argv[i], "none", strlen(argv[i])))
+ *locking_parms = 1;
+ else if (!strnicmp(argv[i + 1], "locking", strlen(argv[i + 1]))) {
+ *locking_type = &locking_none;
+ *locking_parms = 2;
+ } else if (!strnicmp(argv[i + 1], "cluster", strlen(argv[i + 1]))) {
+ *locking_type = &locking_cluster;
+ /* FIXME: namespace. */
+ *locking_parms = 3;
+ }
+
+ return *locking_parms == 1 ? -EINVAL : 0;
+}
+
+/* Set backing device information properties of RAID set. */
+static void rs_set_bdi(struct raid_set *rs, unsigned stripes, unsigned chunks)
+{
+ unsigned p, ra_pages;
+ struct mapped_device *md = dm_table_get_md(rs->ti->table);
+ struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
+
+ /* Set read-ahead for the RAID set and the component devices. */
+ bdi->ra_pages = stripes * stripe_pages(rs, rs->set.io_size);
+ ra_pages = chunks * chunk_pages(rs->set.io_size);
+ for (p = rs->set.raid_devs; p--; ) {
+ struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
+
+ q->backing_dev_info.ra_pages = ra_pages;
+ }
+
+ /* Set congested function and data. */
+ bdi->congested_fn = raid_set_congested;
+ bdi->congested_data = rs;
+
+ dm_put(md);
+}
+
+/* Get read-ahead properties of the RAID set and its component devices. */
+static void rs_get_ra(struct raid_set *rs, unsigned *stripes, unsigned *chunks)
+{
+ struct mapped_device *md = dm_table_get_md(rs->ti->table);
+
+ *stripes = dm_disk(md)->queue->backing_dev_info.ra_pages
+ / stripe_pages(rs, rs->set.io_size);
+ *chunks = bdev_get_queue(rs->dev->dev->bdev)->backing_dev_info.ra_pages
+ / chunk_pages(rs->set.io_size);
+
+ dm_put(md);
+}
+
+/*
+ * Construct a RAID4/5 mapping:
+ *
+ * log_type #log_params <log_params> \
+ * raid_type [#parity_dev] #raid_variable_params <raid_params> \
+ * [locking "none"/"cluster"]
+ * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
+ *
+ * log_type = "core"/"disk",
+ * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
+ * log_params = [dirty_log_path] region_size [[no]sync])
+ *
+ * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
+ *
+ * #parity_dev = N if raid_type = "raid4"
+ * o N = -1: pick default = last device
+ * o N >= 0 and < #raid_devs: parity device index
+ *
+ * #raid_variable_params = 0-5; raid_params (-1 = default):
+ * [chunk_size [#stripes [io_size [recover_io_size [%recovery_bandwidth]]]]]
+ * o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
+ * and <= CHUNK_SIZE_MAX)
+ * o #stripes is number of stripes allocated to stripe cache
+ * (must be > 1 and < STRIPES_MAX)
+ * o io_size (io unit size per device in sectors; must be 2^^n and > 8)
+ * o recover_io_size (io unit size per device for recovery in sectors;
+ *   must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
+ * o %recovery_bandwidth is the maximum amount spent on recovery during
+ * application io (1-100%)
+ * If raid_variable_params = 0, defaults will be used.
+ * Any raid_variable_param can be set to -1 to apply a default
+ *
+ * #raid_devs = N (N >= 3)
+ *
+ * #dev_to_initialize = N
+ * -1: initialize parity on all devices
+ * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
+ * of a failed devices content after replacement
+ *
+ * <dev_path> = device_path (eg, /dev/sdd1)
+ * <offset> = begin at offset on <dev_path>
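+ *
+ * Example table line (illustrative only; device names, offsets and the
+ * target length below are assumptions, not requirements):
+ *   0 2097152 raid45 core 2 8192 nosync raid5_la 0 3 -1 \
+ *             /dev/sda1 0 /dev/sdb1 0 /dev/sdc1 0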
+ *
+ */
+#define MIN_PARMS 13
+static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int bandwidth = BANDWIDTH, bandwidth_parm = -1,
+ chunk_size = CHUNK_SIZE, chunk_size_parm = -1,
+ dev_to_init, dl_parms, locking_parms, parity_parm, pi = -1,
+ i, io_size = IO_SIZE, io_size_parm = -1,
+ r, raid_devs, raid_parms,
+ recover_io_size = RECOVER_IO_SIZE, recover_io_size_parm = -1,
+ stripes = STRIPES, stripes_parm = -1;
+ unsigned speed;
+ sector_t tmp, sectors_per_dev;
+ struct dm_raid45_locking_type *locking;
+ struct raid_set *rs;
+ struct raid_type *raid_type;
+
+ /* Ensure minimum number of parameters. */
+ if (argc < MIN_PARMS)
+ TI_ERR("Not enough parameters");
+
+ /* Fetch # of dirty log parameters. */
+ if (sscanf(argv[1], "%d", &dl_parms) != 1
+ || !range_ok(dl_parms, 1, 4711))
+ TI_ERR("Bad dirty log parameters number");
+
+ /* Check raid_type. */
+ raid_type = get_raid_type(argv[dl_parms + 2]);
+ if (!raid_type)
+ TI_ERR("Bad raid type");
+
+ /* In case of RAID4, parity drive is selectable. */
+ parity_parm = !!(raid_type->level == raid4);
+
+ /* Handle variable number of RAID parameters. */
+ r = raid_variable_parms(ti, argv, dl_parms + parity_parm + 3,
+ &raid_parms,
+ &chunk_size, &chunk_size_parm,
+ &stripes, &stripes_parm,
+ &io_size, &io_size_parm,
+ &recover_io_size, &recover_io_size_parm,
+ &bandwidth, &bandwidth_parm);
+ if (r)
+ return r;
+
+ r = raid_locking_parms(ti, argv,
+ dl_parms + parity_parm + raid_parms + 4,
+ &locking_parms, &locking);
+ if (r)
+ return r;
+
+ /* # of raid devices. */
+ i = dl_parms + parity_parm + raid_parms + locking_parms + 4;
+ if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
+ raid_devs < raid_type->minimal_devs)
+ TI_ERR("Invalid number of raid devices");
+
+ /* In case of RAID4, check parity drive index is in limits. */
+ if (raid_type->level == raid4) {
+ /* Fetch index of parity device. */
+ if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
+ !range_ok(pi, 0, raid_devs - 1))
+ TI_ERR("Invalid RAID4 parity device index");
+ }
+
+ /*
+ * Index of device to initialize starts at 0
+ *
+ * o -1 -> don't initialize a particular device,
+ * o 0..raid_devs-1 -> initialize respective device
+ * (used for reconstruction of a replaced device)
+ */
+ if (sscanf
+ (argv[dl_parms + parity_parm + raid_parms + locking_parms + 5],
+ "%d", &dev_to_init) != 1
+ || !range_ok(dev_to_init, -1, raid_devs - 1))
+ TI_ERR("Invalid number for raid device to initialize");
+
+ /* Check # of raid device arguments. */
+ if (argc - dl_parms - parity_parm - raid_parms - 6 !=
+ 2 * raid_devs)
+ TI_ERR("Wrong number of raid device/offset arguments");
+
+ /*
+	 * Check that the table length is divisible without
+	 * remainder by (raid_devs - parity_devs)
+ */
+ if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
+		      &sectors_per_dev))
+ TI_ERR
+	    ("Target length not divisible by number of data devices");
+
+ /*
+ * Check that the device size is
+	 * divisible without remainder by the chunk size
+ */
+ if (!multiple(sectors_per_dev, chunk_size, &tmp))
+		TI_ERR("Device length not divisible by chunk_size");
+
+ /****************************************************************
+ * Now that we checked the constructor arguments ->
+ * let's allocate the RAID set
+ ****************************************************************/
+ r = context_alloc(&rs, raid_type, stripes, chunk_size, io_size,
+ recover_io_size, raid_devs, sectors_per_dev,
+ ti, dl_parms, argv);
+ if (r)
+ return r;
+
+ /*
+ * Set these here in order to avoid passing
+ * too many arguments to context_alloc()
+ */
+ rs->set.dev_to_init_parm = dev_to_init;
+ rs->set.dev_to_init = dev_to_init;
+ rs->set.pi_parm = pi;
+ rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
+ rs->set.raid_parms = raid_parms;
+ rs->set.chunk_size_parm = chunk_size_parm;
+ rs->set.io_size_parm = io_size_parm;
+ rs->sc.stripes_parm = stripes_parm;
+ rs->recover.io_size_parm = recover_io_size_parm;
+ rs->recover.bandwidth_parm = bandwidth_parm;
+ recover_set_bandwidth(rs, bandwidth);
+
+ /* Use locking type to lock stripe access. */
+ rs->locking = locking;
+
+	/* Get the device/offset tuples. */
+ argv += dl_parms + 6 + parity_parm + raid_parms;
+ r = dev_parms(ti, rs, argv, &i);
+ if (r)
+ goto err;
+
+ /* Initialize recovery. */
+ rs->recover.start_jiffies = jiffies;
+ rs->recover.end_jiffies = 0;
+ recovery_region_reset(rs);
+
+ /* Allow for recovery of any nosync regions. */
+ SetRSRecover(rs);
+
+ /* Set backing device information (eg. read ahead). */
+ rs_set_bdi(rs, chunk_size * 2, io_size * 4);
+ SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
+
+ speed = xor_optimize(rs); /* Select best xor algorithm. */
+
+ /* Initialize work queue to handle this RAID set's io. */
+ r = rs_workqueue_init(rs);
+ if (r)
+ goto err;
+
+ raid_set_log(rs, speed); /* Log information about RAID set. */
+
+ /*
+ * Make sure that dm core only hands maximum io size
+ * length down and pays attention to io boundaries.
+ */
+ ti->split_io = rs->set.io_size;
+ ti->private = rs;
+ return 0;
+
+err:
+ context_free(rs, ti, i);
+ return r;
+}
+
+/*
+ * Destruct a raid mapping
+ */
+static void raid_dtr(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ /* Indicate recovery end so that ios in flight drain. */
+ ClearRSRecover(rs);
+
+ wake_do_raid(rs); /* Wake daemon. */
+ wait_ios(rs); /* Wait for any io still being processed. */
+ destroy_workqueue(rs->io.wq);
+ context_free(rs, ti, rs->set.raid_devs);
+}
+
+/* Queues ios to RAID sets. */
+static inline void queue_bio(struct raid_set *rs, struct bio *bio)
+{
+ int wake;
+ struct bio_list *in = &rs->io.in;
+ spinlock_t *in_lock = &rs->io.in_lock;
+
+ spin_lock_irq(in_lock);
+ wake = bio_list_empty(in);
+ bio_list_add(in, bio);
+ spin_unlock_irq(in_lock);
+
+ /* Wake daemon if input list was empty. */
+ if (wake)
+ wake_do_raid(rs);
+}
+
+/* Raid mapping function. */
+static int raid_map(struct dm_target *ti, struct bio *bio,
+ union map_info *map_context)
+{
+ /* I don't want to waste stripe cache capacity. */
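+	/* Failing readahead is harmless; a normal read follows later if needed. */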
+ if (bio_rw(bio) == READA)
+ return -EIO;
+ else {
+ struct raid_set *rs = ti->private;
+
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats +
+ (bio_data_dir(bio) == WRITE ?
+ S_BIOS_WRITE : S_BIOS_READ));
+
+ /*
+ * Get io reference to be waiting for to drop
+ * to zero on device suspension/destruction.
+ */
+ io_get(rs);
+ bio->bi_sector -= ti->begin; /* Remap sector. */
+ queue_bio(rs, bio); /* Queue to the daemon. */
+ return DM_MAPIO_SUBMITTED; /* Handle later. */
+ }
+}
+
+/* Device suspend. */
+static void raid_postsuspend(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+ struct dm_dirty_log *dl = rs->recover.dl;
+
+ SetRSSuspended(rs);
+
+ if (RSRecover(rs))
+ dm_rh_stop_recovery(rs->recover.rh); /* Wakes do_raid(). */
+ else
+ wake_do_raid(rs);
+
+ wait_ios(rs); /* Wait for completion of all ios being processed. */
+ if (dl->type->postsuspend && dl->type->postsuspend(dl))
+ /* Suspend dirty log. */
+ /* FIXME: need better error handling. */
+ DMWARN("log suspend failed");
+}
+
+/* Device resume. */
+static void raid_resume(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+ struct recover *rec = &rs->recover;
+ struct dm_dirty_log *dl = rec->dl;
+
+ if (dl->type->resume && dl->type->resume(dl))
+ /* Resume dirty log. */
+ /* FIXME: need better error handling. */
+ DMWARN("log resume failed");
+
+ rec->nr_regions_to_recover =
+ rec->nr_regions - dl->type->get_sync_count(dl);
+
+ ClearRSSuspended(rs);
+
+ /* Reset any unfinished recovery. */
+ if (RSRecover(rs)) {
+ recovery_region_reset(rs);
+ dm_rh_start_recovery(rec->rh);/* Calls wake_do_raid(). */
+ } else
+ wake_do_raid(rs);
+}
+
+static INLINE unsigned sc_size(struct raid_set *rs)
+{
+	return to_sector(atomic_read(&rs->sc.stripes) *
+			 (sizeof(struct stripe) +
+			  (sizeof(struct stripe_set) +
+			   (sizeof(struct page_list) +
+			    to_bytes(rs->set.io_size) * rs->set.raid_devs)) +
+			  (rs->recover.end_jiffies ? 0 :
+			   to_bytes(rs->set.raid_devs * rs->recover.io_size))));
+}
+
+/* REMOVEME: status output for development. */
+static void
+raid_devel_stats(struct dm_target *ti, char *result,
+ unsigned *size, unsigned maxlen)
+{
+ unsigned chunks, stripes, sz = *size;
+ unsigned long j;
+ char buf[BDEVNAME_SIZE], *p;
+ struct stats_map *sm, *sm_end = ARRAY_END(stats_map);
+ struct raid_set *rs = ti->private;
+ struct recover *rec = &rs->recover;
+ struct timespec ts;
+
+ DMEMIT("%s ", version);
+ DMEMIT("io_inprocess=%d ", atomic_read(&rs->io.in_process));
+ DMEMIT("io_inprocess_max=%d ", atomic_read(&rs->io.in_process_max));
+
+ for (sm = stats_map; sm < sm_end; sm++)
+ DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
+
+ DMEMIT(" overwrite=%s ", RSCheckOverwrite(rs) ? "on" : "off");
+ DMEMIT("sc=%u/%u/%u/%u/%u ", rs->set.chunk_size, rs->set.io_size,
+ atomic_read(&rs->sc.stripes), rs->sc.hash.buckets,
+ sc_size(rs));
+
+ j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
+ rec->start_jiffies;
+ jiffies_to_timespec(j, &ts);
+ sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
+ p = strchr(buf, '.');
+ p[3] = 0;
+
+ DMEMIT("rg=%llu%s/%llu/%llu/%u %s ",
+ (unsigned long long) rec->nr_regions_recovered,
+ RSRegionGet(rs) ? "+" : "",
+ (unsigned long long) rec->nr_regions_to_recover,
+ (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
+
+ rs_get_ra(rs, &stripes, &chunks);
+ DMEMIT("ra=%u/%u ", stripes, chunks);
+
+ *size = sz;
+}
+
+static int
+raid_status(struct dm_target *ti, status_type_t type,
+ char *result, unsigned maxlen)
+{
+ unsigned i, sz = 0;
+ char buf[BDEVNAME_SIZE];
+ struct raid_set *rs = ti->private;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ /* REMOVEME: statistics. */
+ if (RSDevelStats(rs))
+ raid_devel_stats(ti, result, &sz, maxlen);
+
+ DMEMIT("%u ", rs->set.raid_devs);
+
+ for (i = 0; i < rs->set.raid_devs; i++)
+ DMEMIT("%s ",
+ format_dev_t(buf, rs->dev[i].dev->bdev->bd_dev));
+
+ DMEMIT("1 ");
+ for (i = 0; i < rs->set.raid_devs; i++) {
+ DMEMIT("%c", dev_operational(rs, i) ? 'A' : 'D');
+
+ if (rs->set.raid_type->level == raid4 &&
+ i == rs->set.pi)
+ DMEMIT("p");
+
+ if (rs->set.dev_to_init == i)
+ DMEMIT("i");
+ }
+
+ break;
+
+ case STATUSTYPE_TABLE:
+ sz = rs->recover.dl->type->status(rs->recover.dl, type,
+ result, maxlen);
+ DMEMIT("%s %u ", rs->set.raid_type->name,
+ rs->set.raid_parms);
+
+ if (rs->set.raid_type->level == raid4)
+ DMEMIT("%d ", rs->set.pi_parm);
+
+ if (rs->set.raid_parms)
+ DMEMIT("%d ", rs->set.chunk_size_parm);
+
+ if (rs->set.raid_parms > 1)
+ DMEMIT("%d ", rs->sc.stripes_parm);
+
+ if (rs->set.raid_parms > 2)
+ DMEMIT("%d ", rs->set.io_size_parm);
+
+ if (rs->set.raid_parms > 3)
+ DMEMIT("%d ", rs->recover.io_size_parm);
+
+ if (rs->set.raid_parms > 4)
+ DMEMIT("%d ", rs->recover.bandwidth_parm);
+
+ DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
+
+ for (i = 0; i < rs->set.raid_devs; i++)
+ DMEMIT("%s %llu ",
+ format_dev_t(buf,
+ rs->dev[i].dev->bdev->bd_dev),
+ (unsigned long long) rs->dev[i].start);
+ }
+
+ return 0;
+}
+
+/*
+ * Message interface
+ */
+enum raid_msg_actions {
+ act_bw, /* Recovery bandwidth switch. */
+ act_dev, /* Device failure switch. */
+ act_overwrite, /* Stripe overwrite check. */
+ act_read_ahead, /* Set read ahead. */
+ act_stats, /* Development statistics switch. */
+ act_sc, /* Stripe cache switch. */
+
+ act_on, /* Set entity on. */
+ act_off, /* Set entity off. */
+ act_reset, /* Reset entity. */
+
+ act_set = act_on, /* Set # absolute. */
+ act_grow = act_off, /* Grow # by an amount. */
+ act_shrink = act_reset, /* Shrink # by an amount. */
+};
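+
+/*
+ * Note: act_set/act_grow/act_shrink deliberately alias act_on/act_off/
+ * act_reset, so the same parsed action bits serve both the switch-style
+ * and the numeric messages.
+ */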
+
+/* Turn a delta to absolute. */
+static int _absolute(unsigned long action, int act, int r)
+{
+ /* Make delta absolute. */
+ if (test_bit(act_set, &action))
+ ;
+ else if (test_bit(act_grow, &action))
+ r += act;
+ else if (test_bit(act_shrink, &action))
+ r = act - r;
+ else
+ r = -EINVAL;
+
+ return r;
+}
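+
+/*
+ * Example: with act = 50 and an incoming value of 10, act_set keeps 10,
+ * act_grow yields 60 and act_shrink yields 40 (act - value).
+ */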
+
+ /* Change recovery io bandwidth. */
+static int bandwidth_change(struct dm_msg *msg, void *context)
+{
+ struct raid_set *rs = context;
+ int act = rs->recover.bandwidth;
+ int bandwidth = DM_MSG_INT_ARG(msg);
+
+ if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
+ /* Make delta bandwidth absolute. */
+ bandwidth = _absolute(msg->action, act, bandwidth);
+
+ /* Check range. */
+ if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
+ recover_set_bandwidth(rs, bandwidth);
+ return 0;
+ }
+ }
+
+ set_bit(dm_msg_ret_arg, &msg->ret);
+ set_bit(dm_msg_ret_inval, &msg->ret);
+ return -EINVAL;
+}
+
+/* Change state of a device (running/offline). */
+/* FIXME: this only works while recovering! */
+static int device_state(struct dm_msg *msg, void *context)
+{
+ int r;
+ const char *str = "is already ";
+ union dev_lookup dl = { .dev_name = DM_MSG_STR_ARG(msg) };
+ struct raid_set *rs = context;
+
+ r = raid_dev_lookup(rs, strchr(dl.dev_name, ':') ?
+ bymajmin : byname, &dl);
+ if (r == -ENODEV) {
+ DMERR("device %s is no member of this set", dl.dev_name);
+ return r;
+ }
+
+ if (test_bit(act_off, &msg->action)) {
+ if (dev_operational(rs, r))
+ str = "";
+ } else if (!dev_operational(rs, r))
+ str = "";
+
+ DMINFO("/dev/%s %s%s", dl.dev_name, str,
+ test_bit(act_off, &msg->action) ? "offline" : "running");
+
+ return test_bit(act_off, &msg->action) ?
+ raid_set_check_and_degrade(rs, NULL, r) :
+ raid_set_check_and_upgrade(rs, r);
+}
+
+/* Set/reset development feature flags. */
+static int devel_flags(struct dm_msg *msg, void *context)
+{
+ struct raid_set *rs = context;
+
+ if (test_bit(act_on, &msg->action))
+ return test_and_set_bit(msg->spec->parm,
+ &rs->io.flags) ? -EPERM : 0;
+ else if (test_bit(act_off, &msg->action))
+ return test_and_clear_bit(msg->spec->parm,
+ &rs->io.flags) ? 0 : -EPERM;
+ else if (test_bit(act_reset, &msg->action)) {
+ if (test_bit(act_stats, &msg->action)) {
+ stats_reset(rs);
+ goto on;
+ } else if (test_bit(act_overwrite, &msg->action)) {
+on:
+ set_bit(msg->spec->parm, &rs->io.flags);
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+ /* Set stripe and chunk read ahead pages. */
+static int read_ahead_set(struct dm_msg *msg, void *context)
+{
+ int stripes = DM_MSG_INT_ARGS(msg, 0);
+ int chunks = DM_MSG_INT_ARGS(msg, 1);
+
+ if (range_ok(stripes, 1, 512) &&
+ range_ok(chunks, 1, 512)) {
+ rs_set_bdi(context, stripes, chunks);
+ return 0;
+ }
+
+ set_bit(dm_msg_ret_arg, &msg->ret);
+ set_bit(dm_msg_ret_inval, &msg->ret);
+ return -EINVAL;
+}
+
+/* Resize the stripe cache. */
+static int stripecache_resize(struct dm_msg *msg, void *context)
+{
+ int act, stripes;
+ struct raid_set *rs = context;
+
+	/* Deny permission in case the daemon is still shrinking! */
+ if (atomic_read(&rs->sc.stripes_to_shrink))
+ return -EPERM;
+
+ stripes = DM_MSG_INT_ARG(msg);
+ if (stripes > 0) {
+ act = atomic_read(&rs->sc.stripes);
+
+ /* Make delta stripes absolute. */
+ stripes = _absolute(msg->action, act, stripes);
+
+ /*
+ * Check range and that the # of stripes changes.
+		 * We can grow from here but need to leave any
+ * shrinking to the worker for synchronization.
+ */
+ if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX)) {
+ if (stripes > act)
+ return sc_grow(&rs->sc, stripes - act, SC_GROW);
+ else if (stripes < act) {
+ atomic_set(&rs->sc.stripes_to_shrink,
+ act - stripes);
+ wake_do_raid(rs);
+ }
+
+ return 0;
+ }
+ }
+
+ set_bit(dm_msg_ret_arg, &msg->ret);
+ set_bit(dm_msg_ret_inval, &msg->ret);
+ return -EINVAL;
+}
+
+/* Parse the RAID message action. */
+/*
+ * 'ba[ndwidth] {se[t],g[row],sh[rink]} #'		# e.g. 'ba se 50'
+ * 'de[vice] o[ffline]/r[unning] DevName/maj:min'	# e.g. 'device o /dev/sda'
+ * 'o[verwrite] {on,of[f],r[eset]}'			# e.g. 'o of'
+ * 'r[ead_ahead] set #stripes #chunks'			# e.g. 'r se 3 2'
+ * 'sta[tistics] {on,of[f],r[eset]}'			# e.g. 'stat of'
+ * 'str[ipecache] {se[t],g[row],sh[rink]} #'		# e.g. 'stripe set 1024'
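+ *
+ * Example (illustrative; "my_raid5" is a hypothetical mapped device name):
+ *   dmsetup message my_raid5 0 bandwidth set 25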
+ *
+ */
+static int
+raid_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	/* Variables to store the parsed parameters in. */
+ static int i[2];
+ static unsigned long *i_arg[] = {
+ (unsigned long *) i + 0,
+ (unsigned long *) i + 1,
+ };
+ static char *p;
+ static unsigned long *p_arg[] = { (unsigned long *) &p };
+
+ /* Declare all message option strings. */
+ static char *str_sgs[] = { "set", "grow", "shrink" };
+ static char *str_dev[] = { "running", "offline" };
+ static char *str_oor[] = { "on", "off", "reset" };
+
+ /* Declare all actions. */
+ static unsigned long act_sgs[] = { act_set, act_grow, act_shrink };
+ static unsigned long act_oor[] = { act_on, act_off, act_reset };
+
+ /* Bandwidth option. */
+ static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs };
+ static struct dm_message_argument bw_args = {
+ 1, i_arg, { dm_msg_int_t }
+ };
+
+ /* Device option. */
+ static struct dm_message_option dev_opt = { 2, str_dev, act_oor };
+ static struct dm_message_argument dev_args = {
+ 1, p_arg, { dm_msg_base_t }
+ };
+
+ /* Read ahead option. */
+ static struct dm_message_option ra_opt = { 1, str_sgs, act_sgs };
+ static struct dm_message_argument ra_args = {
+ 2, i_arg, { dm_msg_int_t, dm_msg_int_t }
+ };
+
+ static struct dm_message_argument null_args = {
+ 0, NULL, { dm_msg_int_t }
+ };
+
+ /* Overwrite and statistics option. */
+ static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor };
+
+	/* Stripecache option. */
+ static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs };
+
+ /* Declare messages. */
+ static struct dm_msg_spec specs[] = {
+ { "bandwidth", act_bw, &bw_opt, &bw_args,
+ 0, bandwidth_change },
+ { "device", act_dev, &dev_opt, &dev_args,
+ 0, device_state },
+ { "overwrite", act_overwrite, &ovr_stats_opt, &null_args,
+ RS_CHECK_OVERWRITE, devel_flags },
+ { "read_ahead", act_read_ahead, &ra_opt, &ra_args,
+ 0, read_ahead_set },
+ { "statistics", act_stats, &ovr_stats_opt, &null_args,
+ RS_DEVEL_STATS, devel_flags },
+ { "stripecache", act_sc, &stripe_opt, &bw_args,
+ 0, stripecache_resize },
+ };
+
+ /* The message for the parser. */
+ struct dm_msg msg = {
+ .num_specs = ARRAY_SIZE(specs),
+ .specs = specs,
+ };
+
+ return dm_message_parse(TARGET, &msg, ti->private, argc, argv);
+}
+/*
+ * END message interface
+ */
+
+static struct target_type raid_target = {
+ .name = "raid45",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = raid_ctr,
+ .dtr = raid_dtr,
+ .map = raid_map,
+ .postsuspend = raid_postsuspend,
+ .resume = raid_resume,
+ .status = raid_status,
+ .message = raid_message,
+};
+
+static void init_exit(const char *bad_msg, const char *good_msg, int r)
+{
+ if (r)
+ DMERR("Failed to %sregister target [%d]", bad_msg, r);
+ else
+ DMINFO("%s %s", good_msg, version);
+}
+
+static int __init dm_raid_init(void)
+{
+ int r;
+
+ r = dm_register_target(&raid_target);
+ init_exit("", "initialized", r);
+ return r;
+}
+
+static void __exit dm_raid_exit(void)
+{
+ dm_unregister_target(&raid_target);
+ init_exit("un", "exit", 0);
+}
+
+/* Module hooks. */
+module_init(dm_raid_init);
+module_exit(dm_raid_exit);
+
+MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
+MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
+MODULE_LICENSE("GPL");
struct uvc_control_mapping *mapping;
struct uvc_menu_info *menu;
unsigned int i;
- __u8 *data;
int ret;
+ if ((chain->dev->quirks & UVC_QUIRK_HUE_EPIPE) &&
+ (v4l2_ctrl->id == V4L2_CID_HUE))
+ return -EINVAL;
+
ctrl = uvc_find_control(chain, v4l2_ctrl->id, &mapping);
if (ctrl == NULL)
return -EINVAL;
[board_82575] = &e1000_82575_info,
};
+static int entropy = 0;
+module_param(entropy, int, 0);
+MODULE_PARM_DESC(entropy, "Allow igb to populate the /dev/random entropy pool");
+
- static struct pci_device_id igb_pci_tbl[] = {
+ static DEFINE_PCI_DEVICE_TABLE(igb_pci_tbl) = {
{ PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_COPPER), board_82575 },
{ PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_FIBER), board_82575 },
{ PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_SERDES), board_82575 },
skb = re->skb;
sky2_rx_unmap_skb(sky2->hw->pdev, re);
-
prefetch(skb->data);
- re->skb = nskb;
- if (sky2_rx_map_skb(sky2->hw->pdev, re, hdr_space)) {
- dev_kfree_skb(nskb);
- re->skb = skb;
- return NULL;
- }
+ *re = nre;
if (skb_shinfo(skb)->nr_frags)
- skb_put_frags(skb, hdr_space, length);
+ skb_put_frags(sky2, skb, hdr_space, length);
else
skb_put(skb, length);
return skb;
#include "tg3.h"
+static int entropy = 0;
+module_param(entropy, int, 0);
+MODULE_PARM_DESC(entropy, "Allow tg3 to populate the /dev/random entropy pool");
+
#define DRV_MODULE_NAME "tg3"
- #define PFX DRV_MODULE_NAME ": "
- #define DRV_MODULE_VERSION "3.106"
- #define DRV_MODULE_RELDATE "January 12, 2010"
+ #define DRV_MODULE_VERSION "3.108"
+ #define DRV_MODULE_RELDATE "February 17, 2010"
#define TG3_DEF_MAC_MODE 0
#define TG3_DEF_RX_MODE 0
depends on PCMCIA && (BROKEN || !M32R)
select WIRELESS_EXT
select WEXT_SPY
+ select WEXT_PRIV
select CRYPTO
- select CRYPTO_AES
---help---
This is the standard Linux driver to support Cisco/Aironet PCMCIA
802.11 wireless cards. This driver is the same as the Aironet
default_int_mode:
#endif /* CONFIG_PCI_MSI */
/* if we get here we're going to use the default interrupt mode */
- h->intr[SIMPLE_MODE_INT] = pdev->irq;
- return;
+ h->intr[PERF_MODE_INT] = pdev->irq;
}
-static int hpsa_pci_init(struct ctlr_info *h, struct pci_dev *pdev)
+static int __devinit hpsa_pci_init(struct ctlr_info *h, struct pci_dev *pdev)
{
ushort subsystem_vendor_id, subsystem_device_id, command;
- __u32 board_id, scratchpad = 0;
- __u64 cfg_offset;
- __u32 cfg_base_addr;
- __u64 cfg_base_addr_index;
+ u32 board_id, scratchpad = 0;
+ u64 cfg_offset;
+ u32 cfg_base_addr;
+ u64 cfg_base_addr_index;
+ u32 trans_offset;
int i, prod_index, err;
subsystem_vendor_id = pdev->subsystem_vendor;
#include <linux/platform_device.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
+ #include <linux/pm_runtime.h>
+#ifdef CONFIG_KDB_USB
+#include <linux/kdb.h>
+#endif
#include <linux/usb.h>
*/
int (*update_hub_device)(struct usb_hcd *, struct usb_device *hdev,
struct usb_tt *tt, gfp_t mem_flags);
+ int (*reset_device)(struct usb_hcd *, struct usb_device *);
+
+#ifdef CONFIG_KDB_USB
+ /* KDB poll function for this HC */
+ int (*kdb_poll_char)(struct urb *urb);
+ void (*kdb_completion)(struct urb *urb);
+ kdb_hc_keyboard_attach_t kdb_hc_keyboard_attach;
+ kdb_hc_keyboard_detach_t kdb_hc_keyboard_detach;
+#endif /* CONFIG_KDB_USB */
};
extern int usb_hcd_link_urb_to_ep(struct usb_hcd *hcd, struct urb *urb);
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
+ #include <linux/quotaops.h>
#include <linux/ext3_fs.h>
#include <linux/ext3_jbd.h>
+#include "namei.h"
#include "xattr.h"
#include "acl.h"
+#include "nfs4acl.h"
/*
* Called when an inode is released. Note that this is different
sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
ret = inode;
- if (vfs_dq_alloc_inode(inode)) {
- err = -EDQUOT;
+ dquot_initialize(inode);
+ err = dquot_alloc_inode(inode);
+ if (err)
goto fail_drop;
- }
- err = ext3_init_acl(handle, inode, dir);
+ if (test_opt(sb, NFS4ACL))
+ err = ext3_nfs4acl_init(handle, inode, dir);
+ else
+ err = ext3_init_acl(handle, inode, dir);
if (err)
goto fail_free_drop;
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
+#include <linux/nfs4acl.h>
#include <linux/log2.h>
- #include <linux/precache.h>
#include <asm/uaccess.h>
static void ext3_clear_inode(struct inode *inode)
{
struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
+
+#ifdef CONFIG_EXT3_FS_NFS4ACL
+ if (EXT3_I(inode)->i_nfs4acl &&
+ EXT3_I(inode)->i_nfs4acl != EXT3_NFS4ACL_NOT_CACHED) {
+ nfs4acl_put(EXT3_I(inode)->i_nfs4acl);
+ EXT3_I(inode)->i_nfs4acl = EXT3_NFS4ACL_NOT_CACHED;
+ }
+#endif
+ dquot_drop(inode);
ext3_discard_reservation(inode);
EXT3_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
NULL, 0))
goto failed_mount;
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
- if (sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL)
++ if (test_opt(sb, POSIX_ACL))
+ sb->s_flags |= MS_POSIXACL;
- if (sbi->s_mount_opt & EXT3_MOUNT_NFS4ACL)
++ if (test_opt(sb, NFS4ACL))
+ sb->s_flags |= MS_POSIXACL | MS_WITHAPPEND;
if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
(EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
goto restore_opts;
}
- if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+ if (test_opt(sb, ABORT))
ext3_abort(sb, __func__, "Abort forced by user");
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
- if (sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL)
++ if (test_opt(sb, POSIX_ACL))
+ sb->s_flags |= MS_POSIXACL;
- if (sbi->s_mount_opt & EXT3_MOUNT_NFS4ACL)
++ if (test_opt(sb, NFS4ACL))
+ sb->s_flags |= MS_POSIXACL;
+
es = sbi->s_es;
return -ENOENT;
BUG_ON(victim->d_parent->d_inode != dir);
- audit_inode_child(victim->d_name.name, victim, dir);
+ audit_inode_child(victim, dir);
- error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ if (dir->i_op->may_delete) {
+ if (IS_RDONLY(dir))
+ return -EROFS;
+ if (IS_IMMUTABLE(dir))
+ return -EACCES;
+ error = dir->i_op->may_delete(dir, victim->d_inode);
+ if (!error)
+ error = security_inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ } else {
+ error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ if (!error && check_sticky(dir, victim->d_inode))
+ error = -EPERM;
+ }
if (error)
return error;
if (IS_APPEND(dir))
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
- return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+
+ if (dir->i_op->may_create) {
+ if (IS_RDONLY(dir))
+ return -EROFS;
+ if (IS_IMMUTABLE(dir))
+ return -EACCES;
+ error = dir->i_op->may_create(dir, isdir);
+ if (!error)
+ error = security_inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ } else
+ error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+
+ return error;
}
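The may_create()/may_delete() changes above call an optional per-filesystem hook when one is provided and fall back to the generic permission check otherwise. A minimal user-space sketch of that "optional callback with a default" dispatch is below; all type and function names (struct ops, generic_check, fs_specific_check) are invented for illustration and are not part of the patch.

#include <stdio.h>

/* Invented operations table with an optional hook, in the spirit of
 * inode_operations gaining ->may_create / ->may_delete above. */
struct ops {
	int (*may_create)(void);	/* may be NULL */
};

static int generic_check(void)
{
	return 0;			/* pretend the generic permission check passed */
}

static int fs_specific_check(void)
{
	return -1;			/* pretend this filesystem refuses the operation */
}

/* Use the hook when the filesystem provides one, otherwise fall back. */
static int may_create(const struct ops *op)
{
	if (op->may_create)
		return op->may_create();
	return generic_check();
}

int main(void)
{
	struct ops plain = { .may_create = NULL };
	struct ops special = { .may_create = fs_specific_check };

	printf("plain fs: %d, special fs: %d\n",
	       may_create(&plain), may_create(&special));
	return 0;
}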
- /*
- * O_DIRECTORY translates into forcing a directory lookup.
- */
- static inline int lookup_flags(unsigned int f)
- {
- unsigned long retval = LOOKUP_FOLLOW;
-
- if (f & O_NOFOLLOW)
- retval &= ~LOOKUP_FOLLOW;
-
- if (f & O_DIRECTORY)
- retval |= LOOKUP_DIRECTORY;
-
- return retval;
- }
-
/*
* p1 and p2 should be directories on the same fs.
*/
radix_tree_tag_set(&nfsi->nfs_page_tree,
req->wb_index,
NFS_PAGE_TAG_COMMIT);
+ nfsi->ncommit++;
spin_unlock(&inode->i_lock);
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+ inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+ BDI_RECLAIMABLE);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
put_write_access(inode);
goto out_nfserr;
}
- vfs_dq_init(inode);
+
+ /*
+ * Tell a Hierarchical Storage Manager (e.g. via DMAPI) to
+ * return EAGAIN when an action would take minutes instead of
+ * milliseconds so that NFS can reply to the client with
+ * NFSERR_JUKEBOX instead of blocking an nfsd thread.
+ */
+ if (rqstp->rq_vers >= 3)
+ iap->ia_valid |= ATTR_NO_BLOCK;
}
/* sanitize the mode change */
u64 ac_last_group;
u64 ac_max_block; /* Highest block number to allocate. 0 is
is the same as ~0 - unlimited */
+
+ struct ocfs2_alloc_reservation *ac_resv;
};
+ void ocfs2_init_steal_slots(struct ocfs2_super *osb);
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
{
return err;
}
-/**
- * do_remount_sb - asks filesystem to change mount options.
- * @sb: superblock in question
- * @flags: numeric part of options
- * @data: the rest of options
- * @force: whether or not to force the change
- *
- * Alters the mount options of a mounted file system.
- */
-int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
+#define REMOUNT_FORCE 1
+#define REMOUNT_SHRINK_DCACHE 2
+
+static int __do_remount_sb(struct super_block *sb, int flags, void *data, int rflags)
{
int retval;
- int remount_rw;
+ int remount_rw, remount_ro;
if (sb->s_frozen != SB_UNFROZEN)
return -EBUSY;
if (flags & MS_RDONLY)
acct_auto_close(sb);
- shrink_dcache_sb(sb);
+ if (rflags & REMOUNT_SHRINK_DCACHE)
+ shrink_dcache_sb(sb);
sync_filesystem(sb);
+ remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+ remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
+
/* If we are remounting RDONLY and current sb is read/write,
make sure there are no rw files opened */
- if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
+ if (remount_ro) {
- if (force)
+ if (rflags & REMOUNT_FORCE)
mark_files_ro(sb);
else if (!fs_may_remount_ro(sb))
return -EBUSY;
xfs_globals.o \
xfs_ioctl.o \
xfs_iops.o \
- xfs_lrw.o \
xfs_super.o \
xfs_sync.o \
- xfs_xattr.o)
+ xfs_xattr.o \
+ xfs_ksyms.o)
# Objects in support/
xfs-y += $(addprefix support/, \
--- /dev/null
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_alloc.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_itable.h"
+#include "xfs_bmap.h"
+#include "xfs_rw.h"
+#include "xfs_acl.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_inode_item.h"
+#include "xfs_vnodeops.h"
+#include <dmapi.h>
+#include <dmapi_kern.h>
+#include "xfs_dm.h"
+
+#include <linux/mount.h>
+
+#define MAXNAMLEN MAXNAMELEN
+
+#define MIN_DIO_SIZE(mp) ((mp)->m_sb.sb_sectsize)
+#define MAX_DIO_SIZE(mp) (INT_MAX & ~(MIN_DIO_SIZE(mp) - 1))
+
+static void up_rw_sems(struct inode *ip, int flags)
+{
+ if (flags & DM_FLAGS_IALLOCSEM_WR)
+ up_write(&ip->i_alloc_sem);
+ if (flags & DM_FLAGS_IMUX)
+ mutex_unlock(&ip->i_mutex);
+}
+
+static void down_rw_sems(struct inode *ip, int flags)
+{
+ if (flags & DM_FLAGS_IMUX)
+ mutex_lock(&ip->i_mutex);
+ if (flags & DM_FLAGS_IALLOCSEM_WR)
+ down_write(&ip->i_alloc_sem);
+}
+
+
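up_rw_sems() and down_rw_sems() above are mirror images: locks are taken in one fixed order and dropped in the reverse order. A hedged pthread sketch of the same discipline follows; the lock names are illustrative only and do not correspond to real kernel locks.

#include <pthread.h>
#include <stdio.h>

/* Two illustrative locks standing in for i_mutex and i_alloc_sem. */
static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;

/* Acquire in a fixed order (outer, then inner)... */
static void lock_both(void)
{
	pthread_mutex_lock(&outer);
	pthread_mutex_lock(&inner);
}

/* ...and release in the reverse order, mirroring how down_rw_sems()
 * and up_rw_sems() undo each other. */
static void unlock_both(void)
{
	pthread_mutex_unlock(&inner);
	pthread_mutex_unlock(&outer);
}

int main(void)
{
	lock_both();
	printf("critical section\n");
	unlock_both();
	return 0;
}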
+/* Structure used to hold the on-disk version of a dm_attrname_t. All
+ on-disk attribute names start with the 8-byte string "SGI_DMI_".
+*/
+
+typedef struct {
+ char dan_chars[DMATTR_PREFIXLEN + DM_ATTR_NAME_SIZE + 1];
+} dm_dkattrname_t;
+
+/* Structure used by xfs_dm_get_bulkall(), used as the "private_data"
+ * that we want xfs_bulkstat to send to our formatter.
+ */
+typedef struct {
+ dm_fsid_t fsid;
+ void __user *laststruct;
+ dm_dkattrname_t attrname;
+} dm_bulkstat_one_t;
+
+/* In the on-disk inode, DMAPI attribute names consist of the user-provided
+ name with the DMATTR_PREFIXSTRING pre-pended. This string must NEVER be
+ changed!
+*/
+
+static const char dmattr_prefix[DMATTR_PREFIXLEN + 1] = DMATTR_PREFIXSTRING;
+
+static dm_size_t dm_min_dio_xfer = 0; /* direct I/O disabled for now */
+
+
+/* See xfs_dm_get_dmattr() for a description of why this is needed. */
+
+#define XFS_BUG_KLUDGE 256 /* max size of an in-inode attribute value */
+
+#define DM_MAX_ATTR_BYTES_ON_DESTROY 256
+
+#define DM_STAT_SIZE(dmtype,namelen) \
+ (sizeof(dmtype) + sizeof(dm_handle_t) + namelen)
+
+#define DM_STAT_ALIGN (sizeof(__uint64_t))
+
+/* DMAPI's E2BIG == EA's ERANGE */
+#define DM_EA_XLATE_ERR(err) { if (err == ERANGE) err = E2BIG; }
+
+static inline size_t dm_stat_align(size_t size)
+{
+ return (size + (DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+}
+
+static inline size_t dm_stat_size(size_t namelen)
+{
+ return dm_stat_align(sizeof(dm_stat_t) + sizeof(dm_handle_t) + namelen);
+}
+
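dm_stat_align() and DM_STAT_ALIGN round record sizes up to an 8-byte boundary with the usual "add (align-1), mask off the low bits" trick. A small user-space sketch of that arithmetic is below; the sample sizes are made up, and only the mask trick itself mirrors the code above.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same power-of-two round-up as dm_stat_align(): add (align-1), then
 * clear the low bits.  'align' must be a power of two for this to work. */
#define ALIGN_UP(x, align)  (((x) + ((align) - 1)) & ~((size_t)(align) - 1))

int main(void)
{
	size_t align = sizeof(uint64_t);	/* DM_STAT_ALIGN is sizeof(__uint64_t) */

	/* 13 rounds up to 16, 16 stays 16, 17 rounds up to 24. */
	assert(ALIGN_UP(13, align) == 16);
	assert(ALIGN_UP(16, align) == 16);
	assert(ALIGN_UP(17, align) == 24);

	printf("round-up of 13/16/17 to %zu: %zu %zu %zu\n", align,
	       ALIGN_UP(13, align), ALIGN_UP(16, align), ALIGN_UP(17, align));
	return 0;
}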
+/*
+ * xfs_dm_send_data_event()
+ *
+ * Send data event to DMAPI. Drop IO lock (if specified) before
+ * the dm_send_data_event() call and reacquire it afterwards.
+ */
+int
+xfs_dm_send_data_event(
+ dm_eventtype_t event,
+ xfs_inode_t *ip,
+ xfs_off_t offset,
+ size_t length,
+ int flags,
+ int *lock_flags)
+{
+ struct inode *inode = &ip->i_vnode;
+ int error;
+ uint16_t dmstate;
+
+ /* Returns positive errors to XFS */
+
+ do {
+ dmstate = ip->i_d.di_dmstate;
+ if (lock_flags)
+ xfs_iunlock(ip, *lock_flags);
+
+ up_rw_sems(inode, flags);
+
+ error = dm_send_data_event(event, inode, DM_RIGHT_NULL,
+ offset, length, flags);
+ error = -error; /* DMAPI returns negative errors */
+
+ down_rw_sems(inode, flags);
+
+ if (lock_flags)
+ xfs_ilock(ip, *lock_flags);
+ } while (!error && (ip->i_d.di_dmstate != dmstate));
+
+ return error;
+}
+
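xfs_dm_send_data_event() drops the inode lock, makes the slow dm_send_data_event() call, retakes the lock, and retries if di_dmstate moved underneath it. A hedged stand-alone sketch of that retry-on-change pattern follows; do_slow_work() and the version counter are hypothetical stand-ins, not kernel APIs.

#include <stdio.h>

/* Hypothetical shared state protected by some lock; 'version' is bumped
 * whenever the protected data changes (like di_dmstate above). */
static unsigned int version;

static int do_slow_work(void)
{
	/* Stand-in for the slow call made with the lock dropped. */
	return 0;
}

static int send_with_retry(void)
{
	unsigned int seen;
	int error;

	do {
		seen = version;		/* snapshot while "locked" */
		/* ...drop the lock here... */
		error = do_slow_work();	/* slow call with the lock free */
		/* ...retake the lock here... */
	} while (!error && version != seen);	/* retry if the state moved */

	return error;
}

int main(void)
{
	printf("send_with_retry() -> %d\n", send_with_retry());
	return 0;
}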
+/* prohibited_mr_events
+ *
+ * Return event bits representing any events which cannot have managed
+ * region events set due to memory mapping of the file. If the maximum
+ * protection allowed in any pregion includes PROT_WRITE, and the region
+ * is shared and not text, then neither READ nor WRITE events can be set.
+ * Otherwise if the file is memory mapped, no READ event can be set.
+ *
+ */
+STATIC int
+prohibited_mr_events(
+ struct address_space *mapping)
+{
+ int prohibited = (1 << DM_EVENT_READ);
+
+ if (!mapping_mapped(mapping))
+ return 0;
+
+ spin_lock(&mapping->i_mmap_lock);
+ if (mapping_writably_mapped(mapping))
+ prohibited |= (1 << DM_EVENT_WRITE);
+ spin_unlock(&mapping->i_mmap_lock);
+
+ return prohibited;
+}
+
+#ifdef DEBUG_RIGHTS
+STATIC int
+xfs_vp_to_hexhandle(
+ struct inode *inode,
+ u_int type,
+ char *buffer)
+{
+ dm_handle_t handle;
+ u_char *ip;
+ int length;
+ int error;
+ int i;
+
+ /*
+ * XXX: dm_vp_to_handle doesn't exist.
+ * Looks like this debug code is rather dead.
+ */
+ if ((error = dm_vp_to_handle(inode, &handle)))
+ return(error);
+
+ if (type == DM_FSYS_OBJ) { /* a filesystem handle */
+ length = DM_FSHSIZE;
+ } else {
+ length = DM_HSIZE(handle);
+ }
+ for (ip = (u_char *)&handle, i = 0; i < length; i++) {
+ *buffer++ = "0123456789abcdef"[ip[i] >> 4];
+ *buffer++ = "0123456789abcdef"[ip[i] & 0xf];
+ }
+ *buffer = '\0';
+ return(0);
+}
+#endif /* DEBUG_RIGHTS */
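The DEBUG_RIGHTS helper above dumps a binary handle as hex using a nibble lookup table. A self-contained version of that encoding is below; the buffer contents are invented for the example.

#include <stdio.h>
#include <string.h>

/* Encode 'len' bytes of 'in' as lowercase hex into 'out' (NUL terminated).
 * Same nibble-lookup technique as the handle dump above; 'out' must hold
 * at least 2 * len + 1 bytes. */
static void hex_encode(const unsigned char *in, size_t len, char *out)
{
	size_t i;

	for (i = 0; i < len; i++) {
		*out++ = "0123456789abcdef"[in[i] >> 4];
		*out++ = "0123456789abcdef"[in[i] & 0xf];
	}
	*out = '\0';
}

int main(void)
{
	unsigned char handle[4] = { 0xde, 0xad, 0xbe, 0xef };	/* made-up handle bytes */
	char buf[2 * sizeof(handle) + 1];

	hex_encode(handle, sizeof(handle), buf);
	printf("%s\n", buf);		/* prints "deadbeef" */
	return 0;
}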
+
+
+
+
+/* Copy in and validate an attribute name from user space. It should be a
+ string of at least one and at most DM_ATTR_NAME_SIZE characters. Because
+ the dm_attrname_t structure doesn't provide room for the trailing NULL
+ byte, we just copy in one extra character and then zero it if it
+ happens to be non-NULL.
+*/
+
+STATIC int
+xfs_copyin_attrname(
+ dm_attrname_t __user *from, /* dm_attrname_t in user space */
+ dm_dkattrname_t *to) /* name buffer in kernel space */
+{
+ int error = 0;
+ size_t len;
+
+ strcpy(to->dan_chars, dmattr_prefix);
+
+ len = strnlen_user((char __user *)from, DM_ATTR_NAME_SIZE);
+ if (len == 0)
+ error = EFAULT;
+ else {
+ if (copy_from_user(&to->dan_chars[DMATTR_PREFIXLEN], from, len))
+ to->dan_chars[sizeof(to->dan_chars) - 1] = '\0';
+ else if (to->dan_chars[DMATTR_PREFIXLEN] == '\0')
+ error = EINVAL;
+ else
+ to->dan_chars[DMATTR_PREFIXLEN + len - 1] = '\0';
+ }
+
+ return error;
+}
+
+
+/*
+ * Convert the XFS flags into their DMAPI flag equivalent for export
+ */
+STATIC uint
+_xfs_dic2dmflags(
+ __uint16_t di_flags)
+{
+ uint flags = 0;
+
+ if (di_flags & XFS_DIFLAG_ANY) {
+ if (di_flags & XFS_DIFLAG_REALTIME)
+ flags |= DM_XFLAG_REALTIME;
+ if (di_flags & XFS_DIFLAG_PREALLOC)
+ flags |= DM_XFLAG_PREALLOC;
+ if (di_flags & XFS_DIFLAG_IMMUTABLE)
+ flags |= DM_XFLAG_IMMUTABLE;
+ if (di_flags & XFS_DIFLAG_APPEND)
+ flags |= DM_XFLAG_APPEND;
+ if (di_flags & XFS_DIFLAG_SYNC)
+ flags |= DM_XFLAG_SYNC;
+ if (di_flags & XFS_DIFLAG_NOATIME)
+ flags |= DM_XFLAG_NOATIME;
+ if (di_flags & XFS_DIFLAG_NODUMP)
+ flags |= DM_XFLAG_NODUMP;
+ }
+ return flags;
+}
+
+STATIC uint
+xfs_ip2dmflags(
+ xfs_inode_t *ip)
+{
+ return _xfs_dic2dmflags(ip->i_d.di_flags) |
+ (XFS_IFORK_Q(ip) ? DM_XFLAG_HASATTR : 0);
+}
+
+STATIC uint
+xfs_dic2dmflags(
+ xfs_dinode_t *dip)
+{
+ return _xfs_dic2dmflags(be16_to_cpu(dip->di_flags)) |
+ (XFS_DFORK_Q(dip) ? DM_XFLAG_HASATTR : 0);
+}
+
+/*
+ * This copies selected fields in an inode into a dm_stat structure. Because
+ * these fields must return the same values as they would in stat(), the
+ * majority of this code was copied directly from xfs_getattr(). Any future
+ * changes to xfs_getattr() must also be reflected here.
+ */
+STATIC void
+xfs_dip_to_stat(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ xfs_dinode_t *dip,
+ dm_stat_t *buf)
+{
+ xfs_dinode_t *dic = dip;
+
+ /*
+ * The inode format changed when we moved the link count and
+ * made it 32 bits long. If this is an old format inode,
+ * convert it in memory to look like a new one. If it gets
+ * flushed to disk we will convert back before flushing or
+ * logging it. We zero out the new projid field and the old link
+ * count field. We'll handle clearing the pad field (the remains
+ * of the old uuid field) when we actually convert the inode to
+ * the new format. We don't change the version number so that we
+ * can distinguish this from a real new format inode.
+ */
+ if (dic->di_version == 1) {
+ buf->dt_nlink = be16_to_cpu(dic->di_onlink);
+ /*buf->dt_xfs_projid = 0;*/
+ } else {
+ buf->dt_nlink = be32_to_cpu(dic->di_nlink);
+ /*buf->dt_xfs_projid = be16_to_cpu(dic->di_projid);*/
+ }
+ buf->dt_ino = ino;
+ buf->dt_dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
+ buf->dt_mode = be16_to_cpu(dic->di_mode);
+ buf->dt_uid = be32_to_cpu(dic->di_uid);
+ buf->dt_gid = be32_to_cpu(dic->di_gid);
+ buf->dt_size = be64_to_cpu(dic->di_size);
+ buf->dt_atime = be32_to_cpu(dic->di_atime.t_sec);
+ buf->dt_mtime = be32_to_cpu(dic->di_mtime.t_sec);
+ buf->dt_ctime = be32_to_cpu(dic->di_ctime.t_sec);
+ buf->dt_xfs_xflags = xfs_dic2dmflags(dip);
+ buf->dt_xfs_extsize =
+ be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog;
+ buf->dt_xfs_extents = be32_to_cpu(dic->di_nextents);
+ buf->dt_xfs_aextents = be16_to_cpu(dic->di_anextents);
+ buf->dt_xfs_igen = be32_to_cpu(dic->di_gen);
+ buf->dt_xfs_dmstate = be16_to_cpu(dic->di_dmstate);
+
+ switch (dic->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ buf->dt_rdev = xfs_dinode_get_rdev(dic);
+ buf->dt_blksize = BLKDEV_IOSIZE;
+ buf->dt_blocks = 0;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_UUID:
+ buf->dt_rdev = 0;
+ buf->dt_blksize = mp->m_sb.sb_blocksize;
+ buf->dt_blocks = 0;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ buf->dt_rdev = 0;
+ buf->dt_blksize = mp->m_sb.sb_blocksize;
+ buf->dt_blocks =
+ XFS_FSB_TO_BB(mp, be64_to_cpu(dic->di_nblocks));
+ break;
+ }
+
+ memset(&buf->dt_pad1, 0, sizeof(buf->dt_pad1));
+ memset(&buf->dt_pad2, 0, sizeof(buf->dt_pad2));
+ memset(&buf->dt_pad3, 0, sizeof(buf->dt_pad3));
+
+ /* Finally fill in the DMAPI specific fields */
+ buf->dt_pers = 0;
+ buf->dt_change = 0;
+ buf->dt_nevents = DM_EVENT_MAX;
+ buf->dt_emask = be32_to_cpu(dic->di_dmevmask);
+ buf->dt_dtime = be32_to_cpu(dic->di_ctime.t_sec);
+ /* Set if one of READ, WRITE or TRUNCATE bits is set in emask */
+ buf->dt_pmanreg = (DMEV_ISSET(DM_EVENT_READ, buf->dt_emask) ||
+ DMEV_ISSET(DM_EVENT_WRITE, buf->dt_emask) ||
+ DMEV_ISSET(DM_EVENT_TRUNCATE, buf->dt_emask)) ? 1 : 0;
+}
+
+/*
+ * Pull out both ondisk and incore fields, incore has preference.
+ * The inode must be kept locked SHARED by the caller.
+ */
+STATIC void
+xfs_ip_to_stat(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ xfs_inode_t *ip,
+ dm_stat_t *buf)
+{
+ xfs_icdinode_t *dic = &ip->i_d;
+
+ buf->dt_ino = ino;
+ buf->dt_nlink = dic->di_nlink;
+ /*buf->dt_xfs_projid = dic->di_projid;*/
+ buf->dt_mode = dic->di_mode;
+ buf->dt_uid = dic->di_uid;
+ buf->dt_gid = dic->di_gid;
+ buf->dt_size = XFS_ISIZE(ip);
+ buf->dt_dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
+ buf->dt_atime = VFS_I(ip)->i_atime.tv_sec;
+ buf->dt_mtime = dic->di_mtime.t_sec;
+ buf->dt_ctime = dic->di_ctime.t_sec;
+ buf->dt_xfs_xflags = xfs_ip2dmflags(ip);
+ buf->dt_xfs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
+ buf->dt_xfs_extents = dic->di_nextents;
+ buf->dt_xfs_aextents = dic->di_anextents;
+ buf->dt_xfs_igen = dic->di_gen;
+ buf->dt_xfs_dmstate = dic->di_dmstate;
+
+ switch (dic->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ buf->dt_rdev = ip->i_df.if_u2.if_rdev;
+ buf->dt_blksize = BLKDEV_IOSIZE;
+ buf->dt_blocks = 0;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_UUID:
+ buf->dt_rdev = 0;
+ buf->dt_blksize = mp->m_sb.sb_blocksize;
+ buf->dt_blocks = 0;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ buf->dt_rdev = 0;
+ buf->dt_blksize = mp->m_sb.sb_blocksize;
+ buf->dt_blocks = XFS_FSB_TO_BB(mp,
+ (dic->di_nblocks + ip->i_delayed_blks));
+ break;
+ }
+
+ memset(&buf->dt_pad1, 0, sizeof(buf->dt_pad1));
+ memset(&buf->dt_pad2, 0, sizeof(buf->dt_pad2));
+ memset(&buf->dt_pad3, 0, sizeof(buf->dt_pad3));
+
+ /* Finally fill in the DMAPI specific fields */
+ buf->dt_pers = 0;
+ buf->dt_change = 0;
+ buf->dt_nevents = DM_EVENT_MAX;
+ buf->dt_emask = dic->di_dmevmask;
+ buf->dt_dtime = dic->di_ctime.t_sec;
+ /* Set if one of READ, WRITE or TRUNCATE bits is set in emask */
+ buf->dt_pmanreg = (DMEV_ISSET(DM_EVENT_READ, buf->dt_emask) ||
+ DMEV_ISSET(DM_EVENT_WRITE, buf->dt_emask) ||
+ DMEV_ISSET(DM_EVENT_TRUNCATE, buf->dt_emask)) ? 1 : 0;
+}
+
+/*
+ * Take the handle and put it at the end of a dm_xstat buffer.
+ * dt_compname is unused in bulkstat - so we zero it out.
+ * Finally, update link in dm_xstat_t to point to next struct.
+ */
+STATIC void
+xfs_dm_handle_to_xstat(
+ dm_xstat_t *xbuf,
+ size_t xstat_sz,
+ dm_handle_t *handle,
+ size_t handle_sz)
+{
+ dm_stat_t *sbuf = &xbuf->dx_statinfo;
+
+ memcpy(xbuf + 1, handle, handle_sz);
+ sbuf->dt_handle.vd_offset = (ssize_t) sizeof(dm_xstat_t);
+ sbuf->dt_handle.vd_length = (size_t) DM_HSIZE(*handle);
+ memset(&sbuf->dt_compname, 0, sizeof(dm_vardata_t));
+ sbuf->_link = xstat_sz;
+}
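xfs_dm_handle_to_xstat() places the handle immediately behind the fixed dm_xstat_t and records where it lives through an offset/length descriptor. A hedged user-space sketch of that "fixed header plus trailing variable data" layout follows; struct vardata and struct record are invented names, not DMAPI types.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative descriptor, in the spirit of dm_vardata_t. */
struct vardata {
	size_t offset;		/* where the payload starts, relative to the record */
	size_t length;		/* payload size in bytes */
};

/* Illustrative fixed-size record header, in the spirit of dm_xstat_t. */
struct record {
	struct vardata handle;	/* describes the bytes that follow the header */
};

int main(void)
{
	const char payload[] = "HANDLE";	/* stand-in for a dm_handle_t */
	size_t total = sizeof(struct record) + sizeof(payload);
	struct record *rec = malloc(total);

	if (!rec)
		return 1;

	/* Copy the payload directly behind the header (rec + 1), then record
	 * its position and size in the descriptor, as the code above does. */
	memcpy(rec + 1, payload, sizeof(payload));
	rec->handle.offset = sizeof(struct record);
	rec->handle.length = sizeof(payload);

	/* A reader recovers the payload from the descriptor alone. */
	assert(memcmp((char *)rec + rec->handle.offset, "HANDLE",
		      rec->handle.length) == 0);
	printf("payload at offset %zu, %zu bytes\n",
	       rec->handle.offset, rec->handle.length);

	free(rec);
	return 0;
}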
+
+STATIC int
+xfs_dm_bulkall_iget_one(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ xfs_daddr_t bno,
+ int *value_lenp,
+ dm_xstat_t *xbuf,
+ u_int *xstat_szp,
+ char *attr_name,
+ caddr_t attr_buf)
+{
+ xfs_inode_t *ip;
+ dm_handle_t handle;
+ u_int xstat_sz = *xstat_szp;
+ int value_len = *value_lenp;
+ int error;
+
+ error = xfs_iget(mp, NULL, ino,
+ XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno);
+ if (error)
+ return error;
+
+ xfs_ip_to_stat(mp, ino, ip, &xbuf->dx_statinfo);
+ dm_ip_to_handle(&ip->i_vnode, &handle);
+ xfs_dm_handle_to_xstat(xbuf, xstat_sz, &handle, sizeof(handle));
+
+ /* Drop ILOCK_SHARED for call to xfs_attr_get */
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ memset(&xbuf->dx_attrdata, 0, sizeof(dm_vardata_t));
+ error = xfs_attr_get(ip, attr_name, attr_buf, &value_len, ATTR_ROOT);
+ iput(&ip->i_vnode);
+
+ DM_EA_XLATE_ERR(error);
+ if (error && (error != ENOATTR)) {
+ if (error == E2BIG)
+ error = ENOMEM;
+ return error;
+ }
+
+ /* How much space was in the attr? */
+ if (error != ENOATTR) {
+ xbuf->dx_attrdata.vd_offset = xstat_sz;
+ xbuf->dx_attrdata.vd_length = value_len;
+ xstat_sz += (value_len+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+ }
+ *xstat_szp = xbuf->dx_statinfo._link = xstat_sz;
+ *value_lenp = value_len;
+ return 0;
+}
+
+
+STATIC int
+xfs_dm_inline_attr(
+ xfs_mount_t *mp,
+ xfs_dinode_t *dip,
+ char *attr_name,
+ caddr_t attr_buf,
+ int *value_lenp)
+{
+ if (dip->di_aformat == XFS_DINODE_FMT_LOCAL) {
+ xfs_attr_shortform_t *sf;
+ xfs_attr_sf_entry_t *sfe;
+ unsigned int namelen = strlen(attr_name);
+ unsigned int valuelen = *value_lenp;
+ int i;
+
+ sf = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
+ sfe = &sf->list[0];
+ for (i = 0; i < sf->hdr.count;
+ sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+ if (sfe->namelen != namelen)
+ continue;
+ if (!(sfe->flags & XFS_ATTR_ROOT))
+ continue;
+ if (memcmp(attr_name, sfe->nameval, namelen) != 0)
+ continue;
+ if (valuelen < sfe->valuelen)
+ return ERANGE;
+ valuelen = sfe->valuelen;
+ memcpy(attr_buf, &sfe->nameval[namelen], valuelen);
+ *value_lenp = valuelen;
+ return 0;
+ }
+ }
+ *value_lenp = 0;
+ return ENOATTR;
+}
+
+STATIC void
+dm_dip_to_handle(
+ xfs_ino_t ino,
+ xfs_dinode_t *dip,
+ dm_fsid_t *fsid,
+ dm_handle_t *handlep)
+{
+ dm_fid_t fid;
+ int hsize;
+
+ fid.dm_fid_len = sizeof(struct dm_fid) - sizeof(fid.dm_fid_len);
+ fid.dm_fid_pad = 0;
+ fid.dm_fid_ino = ino;
+ fid.dm_fid_gen = be32_to_cpu(dip->di_gen);
+
+ memcpy(&handlep->ha_fsid, fsid, sizeof(*fsid));
+ memcpy(&handlep->ha_fid, &fid, fid.dm_fid_len + sizeof(fid.dm_fid_len));
+ hsize = DM_HSIZE(*handlep);
+ memset((char *)handlep + hsize, 0, sizeof(*handlep) - hsize);
+}
+
+STATIC int
+xfs_dm_bulkall_inline_one(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ xfs_dinode_t *dip,
+ dm_fsid_t *fsid,
+ int *value_lenp,
+ dm_xstat_t *xbuf,
+ u_int *xstat_szp,
+ char *attr_name,
+ caddr_t attr_buf)
+{
+ dm_handle_t handle;
+ u_int xstat_sz = *xstat_szp;
+ int value_len = *value_lenp;
+ int error;
+
+ if (dip->di_mode == 0)
+ return ENOENT;
+
+ xfs_dip_to_stat(mp, ino, dip, &xbuf->dx_statinfo);
+ dm_dip_to_handle(ino, dip, fsid, &handle);
+ xfs_dm_handle_to_xstat(xbuf, xstat_sz, &handle, sizeof(handle));
+
+ memset(&xbuf->dx_attrdata, 0, sizeof(dm_vardata_t));
+ error = xfs_dm_inline_attr(mp, dip, attr_name, attr_buf, &value_len);
+ DM_EA_XLATE_ERR(error);
+ if (error && (error != ENOATTR)) {
+ if (error == E2BIG)
+ error = ENOMEM;
+ return error;
+ }
+
+ /* How much space was in the attr? */
+ if (error != ENOATTR) {
+ xbuf->dx_attrdata.vd_offset = xstat_sz;
+ xbuf->dx_attrdata.vd_length = value_len;
+ xstat_sz += (value_len+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+ }
+ *xstat_szp = xbuf->dx_statinfo._link = xstat_sz;
+ *value_lenp = value_len;
+ return 0;
+}
+
+/*
+ * This is used by dm_get_bulkall().
+ * Given an inumber, it igets the inode and fills the given buffer
+ * with the dm_xstat structure for the file.
+ */
+STATIC int
+xfs_dm_bulkall_one(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_ino_t ino, /* inode number to get data for */
+ void __user *buffer, /* buffer to place output in */
+ int ubsize, /* size of buffer */
+ void *private_data, /* my private data */
+ xfs_daddr_t bno, /* starting block of inode cluster */
+ int *ubused, /* amount of buffer we used */
+ void *dibuff, /* on-disk inode buffer */
+ int *res) /* bulkstat result code */
+{
+ dm_xstat_t *xbuf;
+ u_int xstat_sz;
+ int error;
+ int value_len;
+ int kern_buf_sz;
+ int attr_buf_sz;
+ caddr_t attr_buf;
+ void __user *attr_user_buf;
+ dm_bulkstat_one_t *dmb = (dm_bulkstat_one_t*)private_data;
+
+ /* Returns positive errors to XFS */
+
+ *res = BULKSTAT_RV_NOTHING;
+
+ if (!buffer || xfs_internal_inum(mp, ino))
+ return EINVAL;
+
+ xstat_sz = DM_STAT_SIZE(*xbuf, 0);
+ xstat_sz = (xstat_sz + (DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+ if (xstat_sz > ubsize)
+ return ENOMEM;
+
+ kern_buf_sz = xstat_sz;
+ xbuf = kmem_alloc(kern_buf_sz, KM_SLEEP);
+
+ /* Determine place to drop attr value, and available space. */
+ value_len = ubsize - xstat_sz;
+ if (value_len > ATTR_MAX_VALUELEN)
+ value_len = ATTR_MAX_VALUELEN;
+
+ attr_user_buf = buffer + xstat_sz;
+ attr_buf_sz = value_len;
+ attr_buf = kmem_alloc(attr_buf_sz, KM_SLEEP);
+
+ if (!dibuff)
+ error = xfs_dm_bulkall_iget_one(mp, ino, bno,
+ &value_len, xbuf, &xstat_sz,
+ dmb->attrname.dan_chars,
+ attr_buf);
+ else
+ error = xfs_dm_bulkall_inline_one(mp, ino,
+ (xfs_dinode_t *)dibuff,
+ &dmb->fsid,
+ &value_len, xbuf, &xstat_sz,
+ dmb->attrname.dan_chars,
+ attr_buf);
+ if (error)
+ goto out_free_buffers;
+
+ if (copy_to_user(buffer, xbuf, kern_buf_sz)) {
+ error = EFAULT;
+ goto out_free_buffers;
+ }
+ if (copy_to_user(attr_user_buf, attr_buf, value_len)) {
+ error = EFAULT;
+ goto out_free_buffers;
+ }
+
+ kmem_free(attr_buf);
+ kmem_free(xbuf);
+
+ *res = BULKSTAT_RV_DIDONE;
+ if (ubused)
+ *ubused = xstat_sz;
+ dmb->laststruct = buffer;
+ return 0;
+
+ out_free_buffers:
+ kmem_free(attr_buf);
+ kmem_free(xbuf);
+ return error;
+}
+
+/*
+ * Take the handle and put it at the end of a dm_stat buffer.
+ * dt_compname is unused in bulkstat - so we zero it out.
+ * Finally, update link in dm_stat_t to point to next struct.
+ */
+STATIC void
+xfs_dm_handle_to_stat(
+ dm_stat_t *sbuf,
+ size_t stat_sz,
+ dm_handle_t *handle,
+ size_t handle_sz)
+{
+ memcpy(sbuf + 1, handle, handle_sz);
+ sbuf->dt_handle.vd_offset = (ssize_t) sizeof(dm_stat_t);
+ sbuf->dt_handle.vd_length = (size_t) DM_HSIZE(*handle);
+ memset(&sbuf->dt_compname, 0, sizeof(dm_vardata_t));
+ sbuf->_link = stat_sz;
+}
+
+STATIC int
+xfs_dm_bulkattr_iget_one(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ xfs_daddr_t bno,
+ dm_stat_t *sbuf,
+ u_int stat_sz)
+{
+ xfs_inode_t *ip;
+ dm_handle_t handle;
+ int error;
+
+ error = xfs_iget(mp, NULL, ino,
+ XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno);
+ if (error)
+ return error;
+
+ xfs_ip_to_stat(mp, ino, ip, sbuf);
+ dm_ip_to_handle(&ip->i_vnode, &handle);
+ xfs_dm_handle_to_stat(sbuf, stat_sz, &handle, sizeof(handle));
+
+ xfs_iput(ip, XFS_ILOCK_SHARED);
+ return 0;
+}
+
+STATIC int
+xfs_dm_bulkattr_inline_one(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ xfs_dinode_t *dip,
+ dm_fsid_t *fsid,
+ dm_stat_t *sbuf,
+ u_int stat_sz)
+{
+ dm_handle_t handle;
+
+ if (dip->di_mode == 0)
+ return ENOENT;
+ xfs_dip_to_stat(mp, ino, dip, sbuf);
+ dm_dip_to_handle(ino, dip, fsid, &handle);
+ xfs_dm_handle_to_stat(sbuf, stat_sz, &handle, sizeof(handle));
+ return 0;
+}
+
+/*
+ * This is used by dm_get_bulkattr().
+ * Given an inumber, it igets the inode and fills the given buffer
+ * with the dm_stat structure for the file.
+ */
+STATIC int
+xfs_dm_bulkattr_one(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_ino_t ino, /* inode number to get data for */
+ void __user *buffer, /* buffer to place output in */
+ int ubsize, /* size of buffer */
+ void *private_data, /* my private data */
+ xfs_daddr_t bno, /* starting block of inode cluster */
+ int *ubused, /* amount of buffer we used */
+ void *dibuff, /* on-disk inode buffer */
+ int *res) /* bulkstat result code */
+{
+ dm_stat_t *sbuf;
+ u_int stat_sz;
+ int error;
+ dm_bulkstat_one_t *dmb = (dm_bulkstat_one_t*)private_data;
+
+ /* Returns positive errors to XFS */
+
+ *res = BULKSTAT_RV_NOTHING;
+
+ if (!buffer || xfs_internal_inum(mp, ino))
+ return EINVAL;
+
+ stat_sz = DM_STAT_SIZE(*sbuf, 0);
+ stat_sz = (stat_sz+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+ if (stat_sz > ubsize)
+ return ENOMEM;
+
+ sbuf = kmem_alloc(stat_sz, KM_SLEEP);
+
+ if (!dibuff)
+ error = xfs_dm_bulkattr_iget_one(mp, ino, bno, sbuf, stat_sz);
+ else
+ error = xfs_dm_bulkattr_inline_one(mp, ino,
+ (xfs_dinode_t *)dibuff,
+ &dmb->fsid, sbuf, stat_sz);
+ if (error)
+ goto out_free_buffer;
+
+ if (copy_to_user(buffer, sbuf, stat_sz)) {
+ error = EFAULT;
+ goto out_free_buffer;
+ }
+
+ kmem_free(sbuf);
+ *res = BULKSTAT_RV_DIDONE;
+ if (ubused)
+ *ubused = stat_sz;
+ dmb->laststruct = buffer;
+ return 0;
+
+ out_free_buffer:
+ kmem_free(sbuf);
+ return error;
+}
+
+/* xfs_dm_f_get_eventlist - return the dm_eventset_t mask for inode ip. */
+
+STATIC int
+xfs_dm_f_get_eventlist(
+ xfs_inode_t *ip,
+ dm_right_t right,
+ u_int nelem,
+ dm_eventset_t *eventsetp, /* in kernel space! */
+ u_int *nelemp) /* in kernel space! */
+{
+ dm_eventset_t eventset;
+
+ if (right < DM_RIGHT_SHARED)
+ return(EACCES);
+
+ /* Note that we MUST return a regular file's managed region bits as
+ part of the mask because dm_get_eventlist is supposed to return the
+ union of all managed region flags in those bits. Since we only
+ support one region, we can just return the bits as they are. For
+ all other object types, the bits will already be zero. Handy, huh?
+ */
+
+ eventset = ip->i_d.di_dmevmask;
+
+ /* Now copy the event mask and event count back to the caller. We
+ return the lesser of nelem and DM_EVENT_MAX.
+ */
+
+ if (nelem > DM_EVENT_MAX)
+ nelem = DM_EVENT_MAX;
+ eventset &= (1 << nelem) - 1;
+
+ *eventsetp = eventset;
+ *nelemp = nelem;
+ return(0);
+}
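Both get_eventlist paths clamp nelem to DM_EVENT_MAX and then mask the event set down to that many low bits. A tiny sketch of that clamping follows; EVENT_MAX and the sample masks are chosen arbitrarily for the example.

#include <assert.h>
#include <stdio.h>

#define EVENT_MAX 21u	/* stand-in for DM_EVENT_MAX */

/* Keep only the low 'nelem' events of 'mask', after clamping nelem,
 * exactly as the eventlist code above does. */
static unsigned int truncate_events(unsigned int mask, unsigned int nelem)
{
	if (nelem > EVENT_MAX)
		nelem = EVENT_MAX;
	return mask & ((1u << nelem) - 1);
}

int main(void)
{
	/* 0xff has bits 0..7 set; asking for 4 events keeps bits 0..3. */
	assert(truncate_events(0xffu, 4) == 0x0fu);
	/* Asking for more than EVENT_MAX behaves as if EVENT_MAX was passed. */
	assert(truncate_events(0xffffffffu, 64) == (1u << EVENT_MAX) - 1);

	printf("truncate_events(0xff, 4) = 0x%x\n", truncate_events(0xffu, 4));
	return 0;
}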
+
+
+/* xfs_dm_f_set_eventlist - update the dm_eventset_t mask in the inode vp. Only the
+ bits from zero to maxevent-1 are being replaced; higher bits are preserved.
+*/
+
+STATIC int
+xfs_dm_f_set_eventlist(
+ xfs_inode_t *ip,
+ dm_right_t right,
+ dm_eventset_t *eventsetp, /* in kernel space! */
+ u_int maxevent)
+{
+ dm_eventset_t eventset;
+ dm_eventset_t max_mask;
+ dm_eventset_t valid_events;
+ xfs_trans_t *tp;
+ xfs_mount_t *mp;
+ int error;
+
+ if (right < DM_RIGHT_EXCL)
+ return(EACCES);
+
+ eventset = *eventsetp;
+ if (maxevent >= sizeof(ip->i_d.di_dmevmask) * NBBY)
+ return(EINVAL);
+ max_mask = (1 << maxevent) - 1;
+
+ if (S_ISDIR(ip->i_d.di_mode)) {
+ valid_events = DM_XFS_VALID_DIRECTORY_EVENTS;
+ } else { /* file or symlink */
+ valid_events = DM_XFS_VALID_FILE_EVENTS;
+ }
+ if ((eventset & max_mask) & ~valid_events)
+ return(EINVAL);
+
+ /* Adjust the event mask so that the managed region bits will not
+ be altered.
+ */
+
+ max_mask &= ~(1 <<DM_EVENT_READ); /* preserve current MR bits */
+ max_mask &= ~(1 <<DM_EVENT_WRITE);
+ max_mask &= ~(1 <<DM_EVENT_TRUNCATE);
+
+ mp = ip->i_mount;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
+ error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return(error);
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ ip->i_d.di_dmevmask = (eventset & max_mask) | (ip->i_d.di_dmevmask & ~max_mask);
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ igrab(&ip->i_vnode);
+ xfs_trans_commit(tp, 0);
+
+ return(0);
+}
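xfs_dm_f_set_eventlist() replaces only the bits selected by max_mask and removes the managed-region bits from that mask so they survive the update. A small sketch of that "merge under a mask" step is below; the bit positions and values are invented.

#include <assert.h>
#include <stdio.h>

/* Replace only the bits of 'oldbits' selected by 'mask' with bits from
 * 'newbits', leaving everything outside the mask untouched -- the same
 * merge as (eventset & max_mask) | (di_dmevmask & ~max_mask) above. */
static unsigned int merge_under_mask(unsigned int oldbits, unsigned int newbits,
				     unsigned int mask)
{
	return (newbits & mask) | (oldbits & ~mask);
}

int main(void)
{
	unsigned int protected_bit = 1u << 1;		/* pretend bit 1 is a managed-region bit */
	unsigned int mask = 0x0fu & ~protected_bit;	/* caller may touch bits 0, 2 and 3 only */
	unsigned int out;

	/* The protected bit stays set even though 'newbits' clears it. */
	out = merge_under_mask(0x0au /* old */, 0x05u /* new */, mask);
	assert(out & protected_bit);

	printf("old=0x0a new=0x05 mask=0x%x -> 0x%x\n", mask, out);
	return 0;
}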
+
+
+/* xfs_dm_fs_get_eventlist - return the dm_eventset_t mask for filesystem vfsp. */
+
+STATIC int
+xfs_dm_fs_get_eventlist(
+ xfs_mount_t *mp,
+ dm_right_t right,
+ u_int nelem,
+ dm_eventset_t *eventsetp, /* in kernel space! */
+ u_int *nelemp) /* in kernel space! */
+{
+ dm_eventset_t eventset;
+
+ if (right < DM_RIGHT_SHARED)
+ return(EACCES);
+
+ eventset = mp->m_dmevmask;
+
+ /* Now copy the event mask and event count back to the caller. We
+ return the lesser of nelem and DM_EVENT_MAX.
+ */
+
+ if (nelem > DM_EVENT_MAX)
+ nelem = DM_EVENT_MAX;
+ eventset &= (1 << nelem) - 1;
+
+ *eventsetp = eventset;
+ *nelemp = nelem;
+ return(0);
+}
+
+
+/* xfs_dm_fs_set_eventlist - update the dm_eventset_t mask in the mount structure for
+ filesystem vfsp. Only the bits from zero to maxevent-1 are being replaced;
+ higher bits are preserved.
+*/
+
+STATIC int
+xfs_dm_fs_set_eventlist(
+ xfs_mount_t *mp,
+ dm_right_t right,
+ dm_eventset_t *eventsetp, /* in kernel space! */
+ u_int maxevent)
+{
+ dm_eventset_t eventset;
+ dm_eventset_t max_mask;
+
+ if (right < DM_RIGHT_EXCL)
+ return(EACCES);
+
+ eventset = *eventsetp;
+
+ if (maxevent >= sizeof(mp->m_dmevmask) * NBBY)
+ return(EINVAL);
+ max_mask = (1 << maxevent) - 1;
+
+ if ((eventset & max_mask) & ~DM_XFS_VALID_FS_EVENTS)
+ return(EINVAL);
+
+ mp->m_dmevmask = (eventset & max_mask) | (mp->m_dmevmask & ~max_mask);
+ return(0);
+}
+
+
+/* Code in this routine must exactly match the logic in xfs_diordwr() in
+ order for this to work!
+*/
+
+STATIC int
+xfs_dm_direct_ok(
+ xfs_inode_t *ip,
+ dm_off_t off,
+ dm_size_t len,
+ void __user *bufp)
+{
+ xfs_mount_t *mp;
+
+ mp = ip->i_mount;
+
+ /* Realtime files can ONLY do direct I/O. */
+
+ if (XFS_IS_REALTIME_INODE(ip))
+ return(1);
+
+ /* If direct I/O is disabled, or if the request is too small, use
+ buffered I/O.
+ */
+
+ if (!dm_min_dio_xfer || len < dm_min_dio_xfer)
+ return(0);
+
+#if 0
+ /* If the request is not well-formed or is too large, use
+ buffered I/O.
+ */
+
+ if ((__psint_t)bufp & scache_linemask) /* if buffer not aligned */
+ return(0);
+ if (off & mp->m_blockmask) /* if file offset not aligned */
+ return(0);
+ if (len & mp->m_blockmask) /* if xfer length not aligned */
+ return(0);
+ if (len > ctooff(v.v_maxdmasz - 1)) /* if transfer too large */
+ return(0);
+
+ /* A valid direct I/O candidate. */
+
+ return(1);
+#else
+ return(0);
+#endif
+}
+
+
+/* We need to be able to select various combinations of O_NONBLOCK,
+ O_DIRECT, and O_SYNC, yet we don't have a file descriptor and we don't have
+ the file's pathname. All we have is a handle.
+*/
+
+STATIC int
+xfs_dm_rdwr(
+ struct inode *inode,
+ uint fflag,
+ mode_t fmode,
+ dm_off_t off,
+ dm_size_t len,
+ void __user *bufp,
+ int *rvp)
+{
+ const struct cred *cred = current_cred();
+ xfs_inode_t *ip = XFS_I(inode);
+ int error;
+ int oflags;
+ ssize_t xfer;
+ struct file *file;
+ struct dentry *dentry;
+
+ if ((off < 0) || (off > i_size_read(inode)) || !S_ISREG(inode->i_mode))
+ return EINVAL;
+
+ if (fmode & FMODE_READ) {
+ oflags = O_RDONLY;
+ } else {
+ oflags = O_WRONLY;
+ }
+
+ /*
+ * Build file descriptor flags and I/O flags. O_NONBLOCK is needed so
+ * that we don't block on mandatory file locks. This is an invisible IO,
+ * don't change the atime.
+ */
+
+ oflags |= O_LARGEFILE | O_NONBLOCK | O_NOATIME;
+ if (xfs_dm_direct_ok(ip, off, len, bufp))
+ oflags |= O_DIRECT;
+
+ if (fflag & O_SYNC)
+ oflags |= O_SYNC;
+
+ if (inode->i_fop == NULL) {
+ /* no iput; caller did get, and will do put */
+ return EINVAL;
+ }
+
+ igrab(inode);
+
+ dentry = d_obtain_alias(inode);
+ if (dentry == NULL) {
+ iput(inode);
+ return ENOMEM;
+ }
+
+ file = dentry_open(dentry, mntget(ip->i_mount->m_vfsmount), oflags,
+ cred);
+ if (IS_ERR(file)) {
+ return -PTR_ERR(file);
+ }
+ file->f_mode |= FMODE_NOCMTIME;
+
+ if (fmode & FMODE_READ) {
+ xfer = file->f_op->read(file, bufp, len, (loff_t*)&off);
+ } else {
+ xfer = file->f_op->write(file, bufp, len, (loff_t*)&off);
+ }
+
+ if (xfer >= 0) {
+ *rvp = xfer;
+ error = 0;
+ } else {
+ /* xfs_read/xfs_write return negative error--flip it */
+ error = -(int)xfer;
+ }
+
+ fput(file);
+ return error;
+}
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_clear_inherit(
+ struct inode *inode,
+ dm_right_t right,
+ dm_attrname_t __user *attrnamep)
+{
+ return(-ENOSYS); /* Return negative error to DMAPI */
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_create_by_handle(
+ struct inode *inode,
+ dm_right_t right,
+ void __user *hanp,
+ size_t hlen,
+ char __user *cname)
+{
+ return(-ENOSYS); /* Return negative error to DMAPI */
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_downgrade_right(
+ struct inode *inode,
+ dm_right_t right,
+ u_int type) /* DM_FSYS_OBJ or zero */
+{
+#ifdef DEBUG_RIGHTS
+ char buffer[sizeof(dm_handle_t) * 2 + 1];
+
+ if (!xfs_vp_to_hexhandle(inode, type, buffer)) {
+ printf("dm_downgrade_right: old %d new %d type %d handle %s\n",
+ right, DM_RIGHT_SHARED, type, buffer);
+ } else {
+ printf("dm_downgrade_right: old %d new %d type %d handle "
+ "<INVALID>\n", right, DM_RIGHT_SHARED, type);
+ }
+#endif /* DEBUG_RIGHTS */
+ return(0);
+}
+
+
+/* Note: xfs_dm_get_allocinfo() makes no attempt to coalesce two adjacent
+ extents when both are of type DM_EXTENT_RES; this is left to the caller.
+ XFS guarantees that there will never be two adjacent DM_EXTENT_HOLE extents.
+
+ In order to provide the caller with all extents in a file including
+ those beyond the file's last byte offset, we have to use the xfs_bmapi()
+ interface.
+*/
+
+STATIC int
+xfs_dm_get_allocinfo_rvp(
+ struct inode *inode,
+ dm_right_t right,
+ dm_off_t __user *offp,
+ u_int nelem,
+ dm_extent_t __user *extentp,
+ u_int __user *nelemp,
+ int *rvp)
+{
+ xfs_inode_t *ip = XFS_I(inode);
+ xfs_mount_t *mp; /* file system mount point */
+ xfs_fileoff_t fsb_offset;
+ xfs_filblks_t fsb_length;
+ dm_off_t startoff;
+ int elem;
+ xfs_bmbt_irec_t *bmp = NULL;
+ u_int bmpcnt = 50;
+ u_int bmpsz = sizeof(xfs_bmbt_irec_t) * bmpcnt;
+ int error = 0;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ if ((inode->i_mode & S_IFMT) != S_IFREG)
+ return(-EINVAL);
+
+ if (copy_from_user( &startoff, offp, sizeof(startoff)))
+ return(-EFAULT);
+
+ mp = ip->i_mount;
+ ASSERT(mp);
+
+ if (startoff > XFS_MAXIOFFSET(mp))
+ return(-EINVAL);
+
+ if (nelem == 0)
+ return(-EINVAL);
+
+ /* Convert the caller's starting offset into filesystem allocation
+ units as required by xfs_bmapi(). Round the offset down so that
+ it is sure to be included in the reply.
+ */
+
+ fsb_offset = XFS_B_TO_FSBT(mp, startoff);
+ fsb_length = XFS_B_TO_FSB(mp, XFS_MAXIOFFSET(mp)) - fsb_offset;
+ elem = 0;
+
+ if (fsb_length)
+ bmp = kmem_alloc(bmpsz, KM_SLEEP);
+
+ while (fsb_length && elem < nelem) {
+ dm_extent_t extent;
+ xfs_filblks_t fsb_bias;
+ dm_size_t bias;
+ int lock;
+ int num;
+ int i;
+
+ /* Compute how many getbmap structures to use on the xfs_bmapi
+ call.
+ */
+
+ num = MIN((u_int)(nelem - elem), bmpcnt);
+
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ lock = xfs_ilock_map_shared(ip);
+
+ error = xfs_bmapi(NULL, ip, fsb_offset, fsb_length,
+ XFS_BMAPI_ENTIRE, NULL, 0, bmp, &num, NULL, NULL);
+
+ xfs_iunlock_map_shared(ip, lock);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ if (error) {
+ error = -error; /* Return negative error to DMAPI */
+ goto finish_out;
+ }
+
+ /* Fill in the caller's extents, adjusting the bias in the
+ first entry if necessary.
+ */
+
+ for (i = 0; i < num; i++, extentp++) {
+ bias = startoff - XFS_FSB_TO_B(mp, bmp[i].br_startoff);
+ extent.ex_offset = startoff;
+ extent.ex_length =
+ XFS_FSB_TO_B(mp, bmp[i].br_blockcount) - bias;
+ if (bmp[i].br_startblock == HOLESTARTBLOCK) {
+ extent.ex_type = DM_EXTENT_HOLE;
+ } else {
+ extent.ex_type = DM_EXTENT_RES;
+ }
+ startoff = extent.ex_offset + extent.ex_length;
+
+ if (copy_to_user( extentp, &extent, sizeof(extent))) {
+ error = -EFAULT;
+ goto finish_out;
+ }
+
+ fsb_bias = fsb_offset - bmp[i].br_startoff;
+ fsb_offset += bmp[i].br_blockcount - fsb_bias;
+ fsb_length -= bmp[i].br_blockcount - fsb_bias;
+ elem++;
+ }
+ }
+
+ if (fsb_length == 0) {
+ startoff = 0;
+ }
+ if (copy_to_user( offp, &startoff, sizeof(startoff))) {
+ error = -EFAULT;
+ goto finish_out;
+ }
+
+ if (copy_to_user( nelemp, &elem, sizeof(elem))) {
+ error = -EFAULT;
+ goto finish_out;
+ }
+
+ *rvp = (fsb_length == 0 ? 0 : 1);
+
+finish_out:
+ if (bmp)
+ kmem_free(bmp);
+ return(error);
+}
+
+
+STATIC int
+xfs_dm_zero_xstatinfo_link(
+ dm_xstat_t __user *dxs)
+{
+ dm_xstat_t *ldxs;
+ int error = 0;
+
+ if (!dxs)
+ return 0;
+ ldxs = kmalloc(sizeof(*ldxs), GFP_KERNEL);
+ if (!ldxs)
+ return -ENOMEM;
+ if (copy_from_user(ldxs, dxs, sizeof(*dxs))) {
+ error = -EFAULT;
+ } else {
+ ldxs->dx_statinfo._link = 0;
+ if (copy_to_user(dxs, ldxs, sizeof(*dxs)))
+ error = -EFAULT;
+ }
+ kfree(ldxs);
+ return error;
+}
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_bulkall_rvp(
+ struct inode *inode,
+ dm_right_t right,
+ u_int mask,
+ dm_attrname_t __user *attrnamep,
+ dm_attrloc_t __user *locp,
+ size_t buflen,
+ void __user *bufp, /* address of buffer in user space */
+ size_t __user *rlenp, /* user space address */
+ int *rvalp)
+{
+ int error, done;
+ int nelems;
+ u_int statstruct_sz;
+ dm_attrloc_t loc;
+ xfs_mount_t *mp = XFS_I(inode)->i_mount;
+ dm_attrname_t attrname;
+ dm_bulkstat_one_t dmb;
+
+ /* Returns negative errors to DMAPI */
+
+ if (copy_from_user(&attrname, attrnamep, sizeof(attrname)) ||
+ copy_from_user(&loc, locp, sizeof(loc)))
+ return -EFAULT;
+
+ if (attrname.an_chars[0] == '\0')
+ return(-EINVAL);
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ /* Because we will write directly to the user's buffer, make sure that
+ the buffer is properly aligned.
+ */
+
+ if (((unsigned long)bufp & (DM_STAT_ALIGN - 1)) != 0)
+ return(-EFAULT);
+
+ /* Size of the handle is constant for this function.
+ * If there are no files with attributes, then this will be the
+ * maximum number of inodes we can get.
+ */
+
+ statstruct_sz = DM_STAT_SIZE(dm_xstat_t, 0);
+ statstruct_sz = (statstruct_sz+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+
+ nelems = buflen / statstruct_sz;
+ if (nelems < 1) {
+ if (put_user( statstruct_sz, rlenp ))
+ return(-EFAULT);
+ return(-E2BIG);
+ }
+
+ /* Build the on-disk version of the attribute name. */
+ strcpy(dmb.attrname.dan_chars, dmattr_prefix);
+ strncpy(&dmb.attrname.dan_chars[DMATTR_PREFIXLEN],
+ attrname.an_chars, DM_ATTR_NAME_SIZE + 1);
+ dmb.attrname.dan_chars[sizeof(dmb.attrname.dan_chars) - 1] = '\0';
+
+ /*
+ * fill the buffer with dm_xstat_t's
+ */
+
+ dmb.laststruct = NULL;
+ memcpy(&dmb.fsid, mp->m_fixedfsid, sizeof(dm_fsid_t));
+ error = xfs_bulkstat(mp, (xfs_ino_t *)&loc, &nelems,
+ xfs_dm_bulkall_one, (void*)&dmb, statstruct_sz,
+ bufp, BULKSTAT_FG_INLINE, &done);
+ if (error)
+ return(-error); /* Return negative error to DMAPI */
+
+ *rvalp = !done ? 1 : 0;
+
+ if (put_user( statstruct_sz * nelems, rlenp ))
+ return(-EFAULT);
+
+ if (copy_to_user( locp, &loc, sizeof(loc)))
+ return(-EFAULT);
+ /*
+ * If we didn't do any, we must not have any more to do.
+ */
+ if (nelems < 1)
+ return(0);
+ /*
+ * Set _link in the last struct to zero
+ */
+ return xfs_dm_zero_xstatinfo_link((dm_xstat_t __user *)dmb.laststruct);
+}
+
+
+STATIC int
+xfs_dm_zero_statinfo_link(
+ dm_stat_t __user *dxs)
+{
+ dm_stat_t *ldxs;
+ int error = 0;
+
+ if (!dxs)
+ return 0;
+ ldxs = kmalloc(sizeof(*ldxs), GFP_KERNEL);
+ if (!ldxs)
+ return -ENOMEM;
+ if (copy_from_user(ldxs, dxs, sizeof(*dxs))) {
+ error = -EFAULT;
+ } else {
+ ldxs->_link = 0;
+ if (copy_to_user(dxs, ldxs, sizeof(*dxs)))
+ error = -EFAULT;
+ }
+ kfree(ldxs);
+ return error;
+}
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_bulkattr_rvp(
+ struct inode *inode,
+ dm_right_t right,
+ u_int mask,
+ dm_attrloc_t __user *locp,
+ size_t buflen,
+ void __user *bufp,
+ size_t __user *rlenp,
+ int *rvalp)
+{
+ int error, done;
+ int nelems;
+ u_int statstruct_sz;
+ dm_attrloc_t loc;
+ xfs_mount_t *mp = XFS_I(inode)->i_mount;
+ dm_bulkstat_one_t dmb;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ if (copy_from_user( &loc, locp, sizeof(loc)))
+ return(-EFAULT);
+
+ /* Because we will write directly to the user's buffer, make sure that
+ the buffer is properly aligned.
+ */
+
+ if (((unsigned long)bufp & (DM_STAT_ALIGN - 1)) != 0)
+ return(-EFAULT);
+
+ /* size of the handle is constant for this function */
+
+ statstruct_sz = DM_STAT_SIZE(dm_stat_t, 0);
+ statstruct_sz = (statstruct_sz+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+
+ nelems = buflen / statstruct_sz;
+ if (nelems < 1) {
+ if (put_user( statstruct_sz, rlenp ))
+ return(-EFAULT);
+ return(-E2BIG);
+ }
+
+ dmb.laststruct = NULL;
+ memcpy(&dmb.fsid, mp->m_fixedfsid, sizeof(dm_fsid_t));
+ error = xfs_bulkstat(mp, (xfs_ino_t *)&loc, &nelems,
+ xfs_dm_bulkattr_one, (void*)&dmb,
+ statstruct_sz, bufp, BULKSTAT_FG_INLINE, &done);
+ if (error)
+ return(-error); /* Return negative error to DMAPI */
+
+ *rvalp = !done ? 1 : 0;
+
+ if (put_user( statstruct_sz * nelems, rlenp ))
+ return(-EFAULT);
+
+ if (copy_to_user( locp, &loc, sizeof(loc)))
+ return(-EFAULT);
+
+ /*
+ * If we didn't do any, we must not have any more to do.
+ */
+ if (nelems < 1)
+ return(0);
+ /*
+ * Set _link in the last struct to zero
+ */
+ return xfs_dm_zero_statinfo_link((dm_stat_t __user *)dmb.laststruct);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_config(
+ struct inode *inode,
+ dm_right_t right,
+ dm_config_t flagname,
+ dm_size_t __user *retvalp)
+{
+ dm_size_t retval;
+
+ /* Returns negative errors to DMAPI */
+
+ switch (flagname) {
+ case DM_CONFIG_DTIME_OVERLOAD:
+ case DM_CONFIG_PERS_ATTRIBUTES:
+ case DM_CONFIG_PERS_EVENTS:
+ case DM_CONFIG_PERS_MANAGED_REGIONS:
+ case DM_CONFIG_PUNCH_HOLE:
+ case DM_CONFIG_WILL_RETRY:
+ retval = DM_TRUE;
+ break;
+
+ case DM_CONFIG_CREATE_BY_HANDLE: /* these will never be done */
+ case DM_CONFIG_LOCK_UPGRADE:
+ case DM_CONFIG_PERS_INHERIT_ATTRIBS:
+ retval = DM_FALSE;
+ break;
+
+ case DM_CONFIG_BULKALL:
+ retval = DM_TRUE;
+ break;
+ case DM_CONFIG_MAX_ATTR_ON_DESTROY:
+ retval = DM_MAX_ATTR_BYTES_ON_DESTROY;
+ break;
+
+ case DM_CONFIG_MAX_ATTRIBUTE_SIZE:
+ retval = ATTR_MAX_VALUELEN;
+ break;
+
+ case DM_CONFIG_MAX_HANDLE_SIZE:
+ retval = DM_MAX_HANDLE_SIZE;
+ break;
+
+ case DM_CONFIG_MAX_MANAGED_REGIONS:
+ retval = 1;
+ break;
+
+ case DM_CONFIG_TOTAL_ATTRIBUTE_SPACE:
+ retval = 0x7fffffff; /* actually it's unlimited */
+ break;
+
+ default:
+ return(-EINVAL);
+ }
+
+ /* Copy the results back to the user. */
+
+ if (copy_to_user( retvalp, &retval, sizeof(retval)))
+ return(-EFAULT);
+ return(0);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_config_events(
+ struct inode *inode,
+ dm_right_t right,
+ u_int nelem,
+ dm_eventset_t __user *eventsetp,
+ u_int __user *nelemp)
+{
+ dm_eventset_t eventset;
+
+ /* Returns negative errors to DMAPI */
+
+ if (nelem == 0)
+ return(-EINVAL);
+
+ eventset = DM_XFS_SUPPORTED_EVENTS;
+
+ /* Now copy the event mask and event count back to the caller. We
+ return the lesser of nelem and DM_EVENT_MAX.
+ */
+
+ if (nelem > DM_EVENT_MAX)
+ nelem = DM_EVENT_MAX;
+ eventset &= (1 << nelem) - 1;
+
+ if (copy_to_user( eventsetp, &eventset, sizeof(eventset)))
+ return(-EFAULT);
+
+ if (put_user(nelem, nelemp))
+ return(-EFAULT);
+ return(0);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_destroy_dmattr(
+ struct inode *inode,
+ dm_right_t right,
+ dm_attrname_t *attrnamep,
+ char **valuepp,
+ int *vlenp)
+{
+ dm_dkattrname_t dkattrname;
+ int alloc_size;
+ int value_len;
+ char *value;
+ int error;
+
+ /* Returns negative errors to DMAPI */
+
+ *vlenp = -1; /* assume failure by default */
+
+ if (attrnamep->an_chars[0] == '\0')
+ return(-EINVAL);
+
+ /* Build the on-disk version of the attribute name. */
+
+ strcpy(dkattrname.dan_chars, dmattr_prefix);
+ strncpy(&dkattrname.dan_chars[DMATTR_PREFIXLEN],
+ (char *)attrnamep->an_chars, DM_ATTR_NAME_SIZE + 1);
+ dkattrname.dan_chars[sizeof(dkattrname.dan_chars) - 1] = '\0';
+
+ /* xfs_attr_get will not return anything if the buffer is too small,
+ and we don't know how big to make the buffer, so this may take
+ two tries to get it right. The initial try must use a buffer of
+ at least XFS_BUG_KLUDGE bytes to prevent buffer overflow because
+ of a bug in XFS.
+ */
+
+ alloc_size = XFS_BUG_KLUDGE;
+ value = kmalloc(alloc_size, GFP_KERNEL);
+ if (value == NULL)
+ return(-ENOMEM);
+
+ error = xfs_attr_get(XFS_I(inode), dkattrname.dan_chars, value,
+ &value_len, ATTR_ROOT);
+ if (error == ERANGE) {
+ kfree(value);
+ alloc_size = value_len;
+ value = kmalloc(alloc_size, GFP_KERNEL);
+ if (value == NULL)
+ return(-ENOMEM);
+
+ error = xfs_attr_get(XFS_I(inode), dkattrname.dan_chars, value,
+ &value_len, ATTR_ROOT);
+ }
+ if (error) {
+ kfree(value);
+ DM_EA_XLATE_ERR(error);
+ return(-error); /* Return negative error to DMAPI */
+ }
+
+ /* The attribute exists and has a value. Note that a value_len of
+ zero is valid!
+ */
+
+ if (value_len == 0) {
+ kfree(value);
+ *vlenp = 0;
+ return(0);
+ } else if (value_len > DM_MAX_ATTR_BYTES_ON_DESTROY) {
+ char *value2;
+
+ value2 = kmalloc(DM_MAX_ATTR_BYTES_ON_DESTROY, GFP_KERNEL);
+ if (value2 == NULL) {
+ kfree(value);
+ return(-ENOMEM);
+ }
+ memcpy(value2, value, DM_MAX_ATTR_BYTES_ON_DESTROY);
+ kfree(value);
+ value = value2;
+ value_len = DM_MAX_ATTR_BYTES_ON_DESTROY;
+ }
+ *vlenp = value_len;
+ *valuepp = value;
+ return(0);
+}
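xfs_dm_get_destroy_dmattr() first tries a fixed-size buffer and, when the getter reports ERANGE together with the required size, reallocates and retries once. A hedged user-space sketch of that two-pass pattern follows; get_value() is a hypothetical stand-in for xfs_attr_get(), and its value is made up.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical getter: copies a fixed "attribute value" into buf if it
 * fits, otherwise reports the required size via *len and fails with
 * ERANGE -- the same contract the code above relies on. */
static int get_value(char *buf, int *len)
{
	static const char value[] = "a value larger than the first guess";

	if (*len < (int)sizeof(value)) {
		*len = sizeof(value);
		return ERANGE;
	}
	memcpy(buf, value, sizeof(value));
	*len = sizeof(value);
	return 0;
}

int main(void)
{
	int len = 16;				/* deliberately too small first guess */
	char *buf = malloc(len);
	int error;

	if (!buf)
		return 1;

	error = get_value(buf, &len);
	if (error == ERANGE) {			/* retry once with the reported size */
		free(buf);
		buf = malloc(len);
		if (!buf)
			return 1;
		error = get_value(buf, &len);
	}

	if (!error)
		printf("got %d bytes: %s\n", len, buf);
	free(buf);
	return 0;
}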
+
+/* This code was taken from xfs_fcntl(F_DIOINFO) and modified slightly because
+ we don't have a flags parameter (no open file).
+ Taken from xfs_ioctl(XFS_IOC_DIOINFO) on Linux.
+*/
+
+STATIC int
+xfs_dm_get_dioinfo(
+ struct inode *inode,
+ dm_right_t right,
+ dm_dioinfo_t __user *diop)
+{
+ dm_dioinfo_t dio;
+ xfs_mount_t *mp;
+ xfs_inode_t *ip = XFS_I(inode);
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ mp = ip->i_mount;
+
+ dio.d_miniosz = dio.d_mem = MIN_DIO_SIZE(mp);
+ dio.d_maxiosz = MAX_DIO_SIZE(mp);
+ dio.d_dio_only = DM_FALSE;
+
+ if (copy_to_user(diop, &dio, sizeof(dio)))
+ return(-EFAULT);
+ return(0);
+}
+
+typedef struct dm_readdir_cb {
+ xfs_mount_t *mp;
+ char __user *ubuf;
+ dm_stat_t __user *lastbuf;
+ size_t spaceleft;
+ size_t nwritten;
+ int error;
+ dm_stat_t kstat;
+} dm_readdir_cb_t;
+
+STATIC int
+dm_filldir(void *__buf, const char *name, int namelen, loff_t offset,
+ u64 ino, unsigned int d_type)
+{
+ dm_readdir_cb_t *cb = __buf;
+ dm_stat_t *statp = &cb->kstat;
+ size_t len;
+ int error;
+ int needed;
+
+ /*
+ * Make sure we have enough space.
+ */
+ needed = dm_stat_size(namelen + 1);
+ if (cb->spaceleft < needed) {
+ cb->spaceleft = 0;
+ return -ENOSPC;
+ }
+
+ error = -EINVAL;
+ if (xfs_internal_inum(cb->mp, ino))
+ goto out_err;
+
+ memset(statp, 0, dm_stat_size(MAXNAMLEN));
+ error = -xfs_dm_bulkattr_iget_one(cb->mp, ino, 0,
+ statp, needed);
+ if (error)
+ goto out_err;
+
+ /*
+ * On return from bulkstat_one(), statp->_link points
+ * at the end of the handle in the stat structure.
+ */
+ statp->dt_compname.vd_offset = statp->_link;
+ statp->dt_compname.vd_length = namelen + 1;
+
+ len = statp->_link;
+
+ /* Word-align the record */
+ statp->_link = dm_stat_align(len + namelen + 1);
+
+ error = -EFAULT;
+ if (copy_to_user(cb->ubuf, statp, len))
+ goto out_err;
+ if (copy_to_user(cb->ubuf + len, name, namelen))
+ goto out_err;
+ if (put_user(0, cb->ubuf + len + namelen))
+ goto out_err;
+
+ cb->lastbuf = (dm_stat_t __user *)cb->ubuf;
+ cb->spaceleft -= statp->_link;
+ cb->nwritten += statp->_link;
+ cb->ubuf += statp->_link;
+
+ return 0;
+
+ out_err:
+ cb->error = error;
+ return error;
+}
+
+/* Returns negative errors to DMAPI */
+STATIC int
+xfs_dm_get_dirattrs_rvp(
+ struct inode *inode,
+ dm_right_t right,
+ u_int mask,
+ dm_attrloc_t __user *locp,
+ size_t buflen,
+ void __user *bufp,
+ size_t __user *rlenp,
+ int *rvp)
+{
+ xfs_inode_t *dp = XFS_I(inode);
+ xfs_mount_t *mp = dp->i_mount;
+ dm_readdir_cb_t *cb;
+ dm_attrloc_t loc;
+ int error;
+
+ if (right < DM_RIGHT_SHARED)
+ return -EACCES;
+
+ /*
+ * Make sure that the buffer is properly aligned.
+ */
+ if (((unsigned long)bufp & (DM_STAT_ALIGN - 1)) != 0)
+ return -EFAULT;
+
+ if (mask & ~(DM_AT_HANDLE|DM_AT_EMASK|DM_AT_PMANR|DM_AT_PATTR|
+ DM_AT_DTIME|DM_AT_CFLAG|DM_AT_STAT))
+ return -EINVAL;
+
+ if (!S_ISDIR(inode->i_mode))
+ return -EINVAL;
+
+ /*
+ * bufp should be able to fit at least one dm_stat entry including
+ * dt_handle and full size MAXNAMLEN dt_compname.
+ */
+ if (buflen < dm_stat_size(MAXNAMLEN))
+ return -ENOMEM;
+
+ if (copy_from_user(&loc, locp, sizeof(loc)))
+ return -EFAULT;
+
+ cb = kzalloc(sizeof(*cb) + dm_stat_size(MAXNAMLEN), GFP_KERNEL);
+ if (!cb)
+ return -ENOMEM;
+
+ cb->mp = mp;
+ cb->spaceleft = buflen;
+ cb->ubuf = bufp;
+
+ mutex_lock(&inode->i_mutex);
+ error = -ENOENT;
+ if (!IS_DEADDIR(inode)) {
+ error = -xfs_readdir(dp, cb, dp->i_size,
+ (xfs_off_t *)&loc, dm_filldir);
+ }
+ mutex_unlock(&inode->i_mutex);
+
+ if (error)
+ goto out_kfree;
+ if (cb->error) {
+ error = cb->error;
+ goto out_kfree;
+ }
+
+ error = -EFAULT;
+ if (cb->lastbuf && put_user(0, &cb->lastbuf->_link))
+ goto out_kfree;
+ if (put_user(cb->nwritten, rlenp))
+ goto out_kfree;
+ if (copy_to_user(locp, &loc, sizeof(loc)))
+ goto out_kfree;
+
+ if (cb->nwritten)
+ *rvp = 1;
+ else
+ *rvp = 0;
+ error = 0;
+
+ out_kfree:
+ kfree(cb);
+ return error;
+}
+
+STATIC int
+xfs_dm_get_dmattr(
+ struct inode *inode,
+ dm_right_t right,
+ dm_attrname_t __user *attrnamep,
+ size_t buflen,
+ void __user *bufp,
+ size_t __user *rlenp)
+{
+ dm_dkattrname_t name;
+ char *value;
+ int value_len;
+ int alloc_size;
+ int error;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ if ((error = xfs_copyin_attrname(attrnamep, &name)) != 0)
+ return(-error); /* Return negative error to DMAPI */
+
+ /* Allocate a buffer to receive the attribute's value. We allocate
+ at least one byte even if the caller specified a buflen of zero.
+ (A buflen of zero is considered valid.)
+
+ Allocating a minimum of XFS_BUG_KLUDGE bytes temporarily works
+ around a bug within XFS in which in-inode attribute values are not
+ checked to see if they will fit in the buffer before they are
+ copied. Since no in-core attribute value can be larger than 256
+ bytes (an 8-bit size field), we allocate that minimum size here to
+ prevent buffer overrun in both the kernel's and user's buffers.
+ */
+
+ alloc_size = buflen;
+ if (alloc_size < XFS_BUG_KLUDGE)
+ alloc_size = XFS_BUG_KLUDGE;
+ if (alloc_size > ATTR_MAX_VALUELEN)
+ alloc_size = ATTR_MAX_VALUELEN;
- value = kmem_alloc(alloc_size, KM_SLEEP | KM_LARGE);
++ value = kmem_zalloc_large(alloc_size);
+
+ /* Get the attribute's value. */
+
+ value_len = alloc_size; /* in/out parameter */
+
+ error = xfs_attr_get(XFS_I(inode), name.dan_chars, value, &value_len,
+ ATTR_ROOT);
+ DM_EA_XLATE_ERR(error);
+
+ /* DMAPI requires an errno of ENOENT if an attribute does not exist,
+ so remap ENOATTR here.
+ */
+
+ if (error == ENOATTR)
+ error = ENOENT;
+ if (!error && value_len > buflen)
+ error = E2BIG;
+ if (!error && copy_to_user(bufp, value, value_len))
+ error = EFAULT;
+ if (!error || error == E2BIG) {
+ if (put_user(value_len, rlenp))
+ error = EFAULT;
+ }
+
+ kmem_free(value);
+ return(-error); /* Return negative error to DMAPI */
+}
+
+STATIC int
+xfs_dm_get_eventlist(
+ struct inode *inode,
+ dm_right_t right,
+ u_int type,
+ u_int nelem,
+ dm_eventset_t *eventsetp,
+ u_int *nelemp)
+{
+ int error;
+ xfs_inode_t *ip = XFS_I(inode);
+
+ /* Returns negative errors to DMAPI */
+
+ if (type == DM_FSYS_OBJ) {
+ error = xfs_dm_fs_get_eventlist(ip->i_mount, right, nelem,
+ eventsetp, nelemp);
+ } else {
+ error = xfs_dm_f_get_eventlist(ip, right, nelem,
+ eventsetp, nelemp);
+ }
+ return(-error); /* Returns negative error to DMAPI */
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_fileattr(
+ struct inode *inode,
+ dm_right_t right,
+ u_int mask, /* not used; always return everything */
+ dm_stat_t __user *statp)
+{
+ dm_stat_t stat;
+ xfs_inode_t *ip = XFS_I(inode);
+ xfs_mount_t *mp;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ /* Find the mount point. */
+
+ mp = ip->i_mount;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ xfs_ip_to_stat(mp, ip->i_ino, ip, &stat);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (copy_to_user( statp, &stat, sizeof(stat)))
+ return(-EFAULT);
+ return(0);
+}
+
+
+/* We currently only support a maximum of one managed region per file, and
+ use the DM_EVENT_READ, DM_EVENT_WRITE, and DM_EVENT_TRUNCATE events in
+ the file's dm_eventset_t event mask to implement the DM_REGION_READ,
+ DM_REGION_WRITE, and DM_REGION_TRUNCATE flags for that single region.
+*/
+
+STATIC int
+xfs_dm_get_region(
+ struct inode *inode,
+ dm_right_t right,
+ u_int nelem,
+ dm_region_t __user *regbufp,
+ u_int __user *nelemp)
+{
+ dm_eventset_t evmask;
+ dm_region_t region;
+ xfs_inode_t *ip = XFS_I(inode);
+ u_int elem;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ evmask = ip->i_d.di_dmevmask; /* read the mask "atomically" */
+
+ /* Get the file's current managed region flags out of the
+ dm_eventset_t mask and use them to build a managed region that
+ covers the entire file, i.e. set rg_offset and rg_size to zero.
+ */
+
+ memset((char *)&region, 0, sizeof(region));
+
+ if (evmask & (1 << DM_EVENT_READ))
+ region.rg_flags |= DM_REGION_READ;
+ if (evmask & (1 << DM_EVENT_WRITE))
+ region.rg_flags |= DM_REGION_WRITE;
+ if (evmask & (1 << DM_EVENT_TRUNCATE))
+ region.rg_flags |= DM_REGION_TRUNCATE;
+
+ elem = (region.rg_flags ? 1 : 0);
+
+ if (copy_to_user( nelemp, &elem, sizeof(elem)))
+ return(-EFAULT);
+ if (elem > nelem)
+ return(-E2BIG);
+ if (elem && copy_to_user(regbufp, &region, sizeof(region)))
+ return(-EFAULT);
+ return(0);
+}
+
+
+STATIC int
+xfs_dm_getall_dmattr(
+ struct inode *inode,
+ dm_right_t right,
+ size_t buflen,
+ void __user *bufp,
+ size_t __user *rlenp)
+{
+ attrlist_cursor_kern_t cursor;
+ attrlist_t *attrlist;
+ dm_attrlist_t __user *ulist;
+ int *last_link;
+ int alignment;
+ int total_size;
+ int list_size = 8192; /* should be big enough */
+ int error;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ /* Verify that the user gave us a buffer that is 4-byte aligned, lock
+ it down, and work directly within that buffer. As a side-effect,
+ values of buflen < sizeof(int) return EINVAL.
+ */
+
+ alignment = sizeof(int) - 1;
+ if ((((__psint_t)bufp & alignment) != 0) ||
+ !access_ok(VERIFY_WRITE, bufp, buflen)) {
+ return(-EFAULT);
+ }
+ buflen &= ~alignment; /* round down the alignment */
+
+ /* Initialize all the structures and variables for the main loop. */
+
+ memset(&cursor, 0, sizeof(cursor));
+ attrlist = (attrlist_t *)kmem_alloc(list_size, KM_SLEEP);
+ total_size = 0;
+ ulist = (dm_attrlist_t *)bufp;
+ last_link = NULL;
+
+ /* Use vop_attr_list to get the names of DMAPI attributes, and use
+ vop_attr_get to get their values. There is a risk here that the
+ DMAPI attributes could change between the vop_attr_list and
+ vop_attr_get calls. If we can detect it, we return EIO to notify
+ the user.
+ */
+
+ do {
+ int i;
+
+ /* Get a buffer full of attribute names. If there aren't any
+ more or if we encounter an error, then finish up.
+ */
+
+ error = xfs_attr_list(XFS_I(inode), (char *)attrlist, list_size,
+ ATTR_ROOT, &cursor);
+ DM_EA_XLATE_ERR(error);
+
+ if (error || attrlist->al_count == 0)
+ break;
+
+ for (i = 0; i < attrlist->al_count; i++) {
+ attrlist_ent_t *entry;
+ char *user_name;
+ int size_needed;
+ int value_len;
+
+ /* Skip over all non-DMAPI attributes. If the
+ attribute name is too long, we assume it is
+ non-DMAPI even if it starts with the correct
+ prefix.
+ */
+
+ entry = ATTR_ENTRY(attrlist, i);
+ if (strncmp(entry->a_name, dmattr_prefix, DMATTR_PREFIXLEN))
+ continue;
+ user_name = &entry->a_name[DMATTR_PREFIXLEN];
+ if (strlen(user_name) > DM_ATTR_NAME_SIZE)
+ continue;
+
+ /* We have a valid DMAPI attribute to return. If it
+ won't fit in the user's buffer, we still need to
+ keep track of the number of bytes for the user's
+ next call.
+ */
+
+
+ size_needed = sizeof(*ulist) + entry->a_valuelen;
+ size_needed = (size_needed + alignment) & ~alignment;
+
+ total_size += size_needed;
+ if (total_size > buflen)
+ continue;
+
+ /* Start by filling in all the fields in the
+ dm_attrlist_t structure.
+ */
+
+ strncpy((char *)ulist->al_name.an_chars, user_name,
+ DM_ATTR_NAME_SIZE);
+ ulist->al_data.vd_offset = sizeof(*ulist);
+ ulist->al_data.vd_length = entry->a_valuelen;
+ ulist->_link = size_needed;
+ last_link = &ulist->_link;
+
+ /* Next read the attribute's value into its correct
+ location after the dm_attrlist structure. Any sort
+ of error indicates that the data is moving under us,
+ so we return EIO to let the user know.
+ */
+
+ value_len = entry->a_valuelen;
+
+ error = xfs_attr_get(XFS_I(inode), entry->a_name,
+ (void *)(ulist + 1), &value_len,
+ ATTR_ROOT);
+ DM_EA_XLATE_ERR(error);
+
+ if (error || value_len != entry->a_valuelen) {
+ error = EIO;
+ break;
+ }
+
+ ulist = (dm_attrlist_t *)((char *)ulist + ulist->_link);
+ }
+ } while (!error && attrlist->al_more);
+ if (last_link)
+ *last_link = 0;
+
+ if (!error && total_size > buflen)
+ error = E2BIG;
+ if (!error || error == E2BIG) {
+ if (put_user(total_size, rlenp))
+ error = EFAULT;
+ }
+
+ kmem_free(attrlist);
+ return(-error); /* Return negative error to DMAPI */
+}
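
The buffer filled in above is a packed list: each dm_attrlist_t header is followed by its value at al_data.vd_offset, and _link holds the byte offset of the next entry, with zero terminating the list. A minimal user-space sketch of how a DMAPI application might walk such a buffer, assuming the usual <dmapi.h> definitions and treating the print statement as purely illustrative:

	#include <stdio.h>
	#include <dmapi.h>	/* assumed to provide dm_attrlist_t, DM_ATTR_NAME_SIZE */

	/* buf must hold at least one entry; callers check the returned length first */
	static void walk_dmattr_list(void *buf)
	{
		dm_attrlist_t *entry = buf;

		for (;;) {
			/* the value bytes live al_data.vd_offset bytes past the header */
			void *value = (char *)entry + entry->al_data.vd_offset;

			printf("attr %.*s: %llu value bytes at %p\n",
			       DM_ATTR_NAME_SIZE, entry->al_name.an_chars,
			       (unsigned long long)entry->al_data.vd_length, value);

			if (entry->_link == 0)	/* last entry has _link cleared */
				break;
			entry = (dm_attrlist_t *)((char *)entry + entry->_link);
		}
	}
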
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_getall_inherit(
+ struct inode *inode,
+ dm_right_t right,
+ u_int nelem,
+ dm_inherit_t __user *inheritbufp,
+ u_int __user *nelemp)
+{
+ return(-ENOSYS); /* Return negative error to DMAPI */
+}
+
+
+/* Initialize location pointer for subsequent dm_get_dirattrs,
+ dm_get_bulkattr, and dm_get_bulkall calls. The same initialization must
+ work for inode-based routines (dm_get_dirattrs) and filesystem-based
+ routines (dm_get_bulkattr and dm_get_bulkall). Filesystem-based functions
+ call this routine using the filesystem's root inode.
+*/
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_init_attrloc(
+ struct inode *inode,
+ dm_right_t right,
+ dm_attrloc_t __user *locp)
+{
+ dm_attrloc_t loc = 0;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ if (copy_to_user( locp, &loc, sizeof(loc)))
+ return(-EFAULT);
+ return(0);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_mkdir_by_handle(
+ struct inode *inode,
+ dm_right_t right,
+ void __user *hanp,
+ size_t hlen,
+ char __user *cname)
+{
+ return(-ENOSYS); /* Return negative error to DMAPI */
+}
+
+
+/*
+ * Probe and Punch
+ *
+ * Hole punching alignment is based on the underlying device base
+ * allocation size. Because it is not defined in the DMAPI spec, we
+ * can align how we choose here. Round inwards (offset up and length
+ * down) to the block, extent or page size, whichever is bigger. Our
+ * DMAPI implementation rounds the hole geometry strictly inwards. If
+ * this is not possible, return EINVAL from both xfs_dm_probe_hole
+ * and xfs_dm_punch_hole, which differs from the DMAPI spec. Note that
+ * length = 0 is special - it means "punch to EOF", and at that point
+ * we treat the punch as removing everything past offset (including
+ * preallocation past EOF).
+ */
+
+STATIC int
+xfs_dm_round_hole(
+ dm_off_t offset,
+ dm_size_t length,
+ dm_size_t align,
+ xfs_fsize_t filesize,
+ dm_off_t *roff,
+ dm_size_t *rlen)
+{
+
+ dm_off_t off = offset;
+ dm_size_t len = length;
+
+ /* Try to round offset up to the nearest boundary */
+ *roff = roundup_64(off, align);
+ if ((*roff >= filesize) || (len && (len < align)))
+ return -EINVAL;
+
+ if ((len == 0) || ((off + len) == filesize)) {
+ /* punch to EOF */
+ *rlen = 0;
+ } else {
+ /* Round length down to the nearest boundary. */
+ ASSERT(len >= align);
+ ASSERT(align > (*roff - off));
+ len -= *roff - off;
+ *rlen = len - do_mod(len, align);
+ if (*rlen == 0)
+ return -EINVAL; /* requested length is too small */
+ }
+#ifdef CONFIG_DMAPI_DEBUG
+ printk("xfs_dm_round_hole: off %lu, len %ld, align %lu, "
+ "filesize %llu, roff %ld, rlen %ld\n",
+ offset, length, align, filesize, *roff, *rlen);
+#endif
+ return 0; /* hole geometry successfully rounded */
+}
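
As a quick sanity check of the inward rounding above, here is a minimal stand-alone sketch (user space, illustrative values only) that reproduces the same arithmetic with plain integer operations in place of roundup_64() and do_mod():

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t off = 5000, len = 20000, align = 4096;	/* example request */

		/* round the start up to the next alignment boundary: 8192 */
		uint64_t roff = ((off + align - 1) / align) * align;
		/* shrink the length by the bytes skipped, then round down: 16384 */
		uint64_t rlen = ((len - (roff - off)) / align) * align;

		/* the rounded range [8192, 24576) lies strictly inside [5000, 25000) */
		printf("requested [%llu,+%llu) -> rounded [%llu,+%llu)\n",
		       (unsigned long long)off, (unsigned long long)len,
		       (unsigned long long)roff, (unsigned long long)rlen);
		return 0;
	}
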
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_probe_hole(
+ struct inode *inode,
+ dm_right_t right,
+ dm_off_t off,
+ dm_size_t len,
+ dm_off_t __user *roffp,
+ dm_size_t __user *rlenp)
+{
+ dm_off_t roff;
+ dm_size_t rlen;
+ xfs_inode_t *ip = XFS_I(inode);
+ xfs_mount_t *mp;
+ uint lock_flags;
+ xfs_fsize_t realsize;
+ dm_size_t align;
+ int error;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return -EACCES;
+
+ if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
+ return -EINVAL;
+
+ mp = ip->i_mount;
+ lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
+ xfs_ilock(ip, lock_flags);
+ realsize = ip->i_size;
+ xfs_iunlock(ip, lock_flags);
+
+ if ((off + len) > realsize)
+ return -E2BIG;
+
+ align = 1 << mp->m_sb.sb_blocklog;
+
+ error = xfs_dm_round_hole(off, len, align, realsize, &roff, &rlen);
+ if (error)
+ return error;
+
+ if (copy_to_user( roffp, &roff, sizeof(roff)))
+ return -EFAULT;
+ if (copy_to_user( rlenp, &rlen, sizeof(rlen)))
+ return -EFAULT;
+ return(0);
+}
+
+
+STATIC int
+xfs_dm_punch_hole(
+ struct inode *inode,
+ dm_right_t right,
+ dm_off_t off,
+ dm_size_t len)
+{
+ xfs_flock64_t bf;
+ int error = 0;
+ xfs_inode_t *ip = XFS_I(inode);
+ xfs_mount_t *mp;
+ dm_size_t align;
+ xfs_fsize_t realsize;
+ dm_off_t roff;
+ dm_size_t rlen;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_EXCL)
+ return -EACCES;
+
+ /* Make sure there are no leases. */
+ error = break_lease(inode, FMODE_WRITE);
+ if (error)
+ return -EBUSY;
+
+ error = get_write_access(inode);
+ if (error)
+ return -EBUSY;
+
+ mp = ip->i_mount;
+
+ down_rw_sems(inode, DM_SEM_FLAG_WR);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ realsize = ip->i_size;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ align = xfs_get_extsz_hint(ip);
+ if (align == 0)
+ align = 1;
+
+ align <<= mp->m_sb.sb_blocklog;
+
+ if ((off + len) > realsize) {
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ error = -E2BIG;
+ goto up_and_out;
+ }
+
+ if ((off + len) == realsize)
+ len = 0;
+
+ error = xfs_dm_round_hole(off, len, align, realsize, &roff, &rlen);
+ if (error || (off != roff) || (len != rlen)) {
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ error = -EINVAL;
+ goto up_and_out;
+ }
+
+ bf.l_type = 0;
+ bf.l_whence = 0;
+ bf.l_start = (xfs_off_t)off;
+ if (len) {
+ bf.l_len = len;
+ }
+ else {
+ /*
+ * When we are punching to EOF, we have to make sure we punch
+ * the last partial block that contains EOF. Round up
+ * the length to make sure we punch the block and not just
+ * zero it.
+ */
+ bf.l_len = roundup_64((realsize - off), mp->m_sb.sb_blocksize);
+ }
+
+#ifdef CONFIG_DMAPI_DEBUG
+ printk("xfs_dm_punch_hole: off %lu, len %ld, align %lu\n",
+ off, len, align);
+#endif
+
+ error = xfs_change_file_space(ip, XFS_IOC_UNRESVSP, &bf,
+ (xfs_off_t)off, XFS_ATTR_DMI|XFS_ATTR_NOLOCK);
+
+ /*
+ * if punching to end of file, kill any blocks past EOF that
+ * may have been (speculatively) preallocated. No point in
+ * leaving them around if we are migrating the file....
+ */
+ if (!error && (len == 0)) {
+ error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_HASLOCK);
+ }
+
+ /*
+ * negate the error for return here as core XFS functions return
+ * positive error numbers
+ */
+ if (error)
+ error = -error;
+
+ /* Let threads in send_data_event know we punched the file. */
+ ip->i_d.di_dmstate++;
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+
+up_and_out:
+ up_rw_sems(inode, DM_SEM_FLAG_WR);
+ put_write_access(inode);
+
+ return error;
+}
+
+
+STATIC int
+xfs_dm_read_invis_rvp(
+ struct inode *inode,
+ dm_right_t right,
+ dm_off_t off,
+ dm_size_t len,
+ void __user *bufp,
+ int *rvp)
+{
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ return(-xfs_dm_rdwr(inode, 0, FMODE_READ, off, len, bufp, rvp));
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_release_right(
+ struct inode *inode,
+ dm_right_t right,
+ u_int type) /* DM_FSYS_OBJ or zero */
+{
+#ifdef DEBUG_RIGHTS
+ char buffer[sizeof(dm_handle_t) * 2 + 1];
+
+ if (!xfs_vp_to_hexhandle(inode, type, buffer)) {
+ printf("dm_release_right: old %d type %d handle %s\n",
+ right, type, buffer);
+ } else {
+ printf("dm_release_right: old %d type %d handle "
+ " <INVALID>\n", right, type);
+ }
+#endif /* DEBUG_RIGHTS */
+ return(0);
+}
+
+
+STATIC int
+xfs_dm_remove_dmattr(
+ struct inode *inode,
+ dm_right_t right,
+ int setdtime,
+ dm_attrname_t __user *attrnamep)
+{
+ dm_dkattrname_t name;
+ int error;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_EXCL)
+ return(-EACCES);
+
+ if ((error = xfs_copyin_attrname(attrnamep, &name)) != 0)
+ return(-error); /* Return negative error to DMAPI */
+
+ /* Remove the attribute from the object. */
+
+ error = xfs_attr_remove(XFS_I(inode), name.dan_chars, setdtime ?
+ ATTR_ROOT : (ATTR_ROOT|ATTR_KERNOTIME));
+ DM_EA_XLATE_ERR(error);
+
+ if (error == ENOATTR)
+ error = ENOENT;
+ return(-error); /* Return negative error to DMAPI */
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_request_right(
+ struct inode *inode,
+ dm_right_t right,
+ u_int type, /* DM_FSYS_OBJ or zero */
+ u_int flags,
+ dm_right_t newright)
+{
+#ifdef DEBUG_RIGHTS
+ char buffer[sizeof(dm_handle_t) * 2 + 1];
+
+ if (!xfs_vp_to_hexhandle(inode, type, buffer)) {
+ printf("dm_request_right: old %d new %d type %d flags 0x%x "
+ "handle %s\n", right, newright, type, flags, buffer);
+ } else {
+ printf("dm_request_right: old %d new %d type %d flags 0x%x "
+ "handle <INVALID>\n", right, newright, type, flags);
+ }
+#endif /* DEBUG_RIGHTS */
+ return(0);
+}
+
+
+STATIC int
+xfs_dm_set_dmattr(
+ struct inode *inode,
+ dm_right_t right,
+ dm_attrname_t __user *attrnamep,
+ int setdtime,
+ size_t buflen,
+ void __user *bufp)
+{
+ dm_dkattrname_t name;
+ char *value;
+ int alloc_size;
+ int error;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_EXCL)
+ return(-EACCES);
+
+ if ((error = xfs_copyin_attrname(attrnamep, &name)) != 0)
+ return(-error); /* Return negative error to DMAPI */
+ if (buflen > ATTR_MAX_VALUELEN)
+ return(-E2BIG);
+
+ /* Copy in the attribute's value and store the <name,value> pair in
+ the object. We allocate a buffer of at least one byte even if the
+ caller specified a buflen of zero. (A buflen of zero is considered
+ valid.)
+ */
+
+ alloc_size = (buflen == 0) ? 1 : buflen;
+ value = kmem_alloc(alloc_size, KM_SLEEP);
+ if (copy_from_user( value, bufp, buflen)) {
+ error = EFAULT;
+ } else {
+ error = xfs_attr_set(XFS_I(inode), name.dan_chars, value, buflen,
+ setdtime ? ATTR_ROOT :
+ (ATTR_ROOT|ATTR_KERNOTIME));
+ DM_EA_XLATE_ERR(error);
+ }
+ kmem_free(value);
+ return(-error); /* Return negative error to DMAPI */
+}
+
+STATIC int
+xfs_dm_set_eventlist(
+ struct inode *inode,
+ dm_right_t right,
+ u_int type,
+ dm_eventset_t *eventsetp, /* in kernel space! */
+ u_int maxevent)
+{
+ int error;
+ xfs_inode_t *ip = XFS_I(inode);
+
+ /* Returns negative errors to DMAPI */
+
+ if (type == DM_FSYS_OBJ) {
+ error = xfs_dm_fs_set_eventlist(ip->i_mount, right, eventsetp, maxevent);
+ } else {
+ error = xfs_dm_f_set_eventlist(ip, right, eventsetp, maxevent);
+ }
+ return(-error); /* Return negative error to DMAPI */
+}
+
+
+/*
+ * This turned out not XFS-specific, but leave it here with get_fileattr.
+ */
+
+STATIC int
+xfs_dm_set_fileattr(
+ struct inode *inode,
+ dm_right_t right,
+ u_int mask,
+ dm_fileattr_t __user *statp)
+{
+ dm_fileattr_t stat;
+ struct iattr iattr;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_EXCL)
+ return(-EACCES);
+
+ if (copy_from_user( &stat, statp, sizeof(stat)))
+ return(-EFAULT);
+
+ iattr.ia_valid = 0;
+
+ if (mask & DM_AT_MODE) {
+ iattr.ia_valid |= ATTR_MODE;
+ iattr.ia_mode = stat.fa_mode;
+ }
+ if (mask & DM_AT_UID) {
+ iattr.ia_valid |= ATTR_UID;
+ iattr.ia_uid = stat.fa_uid;
+ }
+ if (mask & DM_AT_GID) {
+ iattr.ia_valid |= ATTR_GID;
+ iattr.ia_gid = stat.fa_gid;
+ }
+ if (mask & DM_AT_ATIME) {
+ iattr.ia_valid |= ATTR_ATIME;
+ iattr.ia_atime.tv_sec = stat.fa_atime;
+ iattr.ia_atime.tv_nsec = 0;
+ inode->i_atime.tv_sec = stat.fa_atime;
+ }
+ if (mask & DM_AT_MTIME) {
+ iattr.ia_valid |= ATTR_MTIME;
+ iattr.ia_mtime.tv_sec = stat.fa_mtime;
+ iattr.ia_mtime.tv_nsec = 0;
+ }
+ if (mask & DM_AT_CTIME) {
+ iattr.ia_valid |= ATTR_CTIME;
+ iattr.ia_ctime.tv_sec = stat.fa_ctime;
+ iattr.ia_ctime.tv_nsec = 0;
+ }
+
+ /*
+ * DM_AT_DTIME only takes effect if DM_AT_CTIME is not specified. We
+ * overload ctime to also act as dtime, i.e. DM_CONFIG_DTIME_OVERLOAD.
+ */
+ if ((mask & DM_AT_DTIME) && !(mask & DM_AT_CTIME)) {
+ iattr.ia_valid |= ATTR_CTIME;
+ iattr.ia_ctime.tv_sec = stat.fa_dtime;
+ iattr.ia_ctime.tv_nsec = 0;
+ }
+ if (mask & DM_AT_SIZE) {
+ iattr.ia_valid |= ATTR_SIZE;
+ iattr.ia_size = stat.fa_size;
+ }
+
+ return -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_DMI);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_set_inherit(
+ struct inode *inode,
+ dm_right_t right,
+ dm_attrname_t __user *attrnamep,
+ mode_t mode)
+{
+ return(-ENOSYS); /* Return negative error to DMAPI */
+}
+
+
+STATIC int
+xfs_dm_set_region(
+ struct inode *inode,
+ dm_right_t right,
+ u_int nelem,
+ dm_region_t __user *regbufp,
+ dm_boolean_t __user *exactflagp)
+{
+ xfs_inode_t *ip = XFS_I(inode);
+ xfs_trans_t *tp;
+ xfs_mount_t *mp;
+ dm_region_t region;
+ dm_eventset_t new_mask;
+ dm_eventset_t mr_mask;
+ int error;
+ u_int exactflag;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_EXCL)
+ return(-EACCES);
+
+ /* If the caller gave us more than one dm_region_t structure, complain.
+ (He has to call dm_get_config() to find out what our limit is.)
+ */
+
+ if (nelem > 1)
+ return(-E2BIG);
+
+ /* If the user provided a dm_region_t structure, then copy it in,
+ validate it, and convert its flags to the corresponding bits in a
+ dm_set_eventlist() event mask. A call with zero regions is
+ equivalent to clearing all region flags.
+ */
+
+ new_mask = 0;
+ if (nelem == 1) {
+		if (copy_from_user( &region, regbufp, sizeof(region)))
+ return(-EFAULT);
+
+ if (region.rg_flags & ~(DM_REGION_READ|DM_REGION_WRITE|DM_REGION_TRUNCATE))
+ return(-EINVAL);
+ if (region.rg_flags & DM_REGION_READ)
+ new_mask |= 1 << DM_EVENT_READ;
+ if (region.rg_flags & DM_REGION_WRITE)
+ new_mask |= 1 << DM_EVENT_WRITE;
+ if (region.rg_flags & DM_REGION_TRUNCATE)
+ new_mask |= 1 << DM_EVENT_TRUNCATE;
+ }
+ mr_mask = (1 << DM_EVENT_READ) | (1 << DM_EVENT_WRITE) | (1 << DM_EVENT_TRUNCATE);
+
+ /* Get the file's existing event mask, clear the old managed region
+ bits, add in the new ones, and update the file's mask.
+ */
+
+ if (new_mask & prohibited_mr_events(inode->i_mapping)) {
+ /* If the change is simply to remove the READ
+ * bit, then that's always okay. Otherwise, it's busy.
+ */
+ dm_eventset_t m1;
+ m1 = ip->i_d.di_dmevmask & ((1 << DM_EVENT_WRITE) | (1 << DM_EVENT_TRUNCATE));
+ if (m1 != new_mask) {
+ return -EBUSY;
+ }
+ }
+
+ mp = ip->i_mount;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
+ error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return(-error); /* Return negative error to DMAPI */
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ ip->i_d.di_dmevmask = (ip->i_d.di_dmevmask & ~mr_mask) | new_mask;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ igrab(inode);
+ xfs_trans_commit(tp, 0);
+
+ /* Return the proper value for *exactflagp depending upon whether or not
+ we "changed" the user's managed region. In other words, if the user
+ specified a non-zero value for either rg_offset or rg_size, we
+ round each of those values back to zero.
+ */
+
+ if (nelem && (region.rg_offset || region.rg_size)) {
+ exactflag = DM_FALSE; /* user region was changed */
+ } else {
+ exactflag = DM_TRUE; /* user region was unchanged */
+ }
+ if (copy_to_user( exactflagp, &exactflag, sizeof(exactflag)))
+ return(-EFAULT);
+ return(0);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_symlink_by_handle(
+ struct inode *inode,
+ dm_right_t right,
+ void __user *hanp,
+ size_t hlen,
+ char __user *cname,
+ char __user *path)
+{
+ return(-ENOSYS); /* Return negative errors to DMAPI */
+}
+
+
+/*
+ * xfs_dm_sync_by_handle needs to do the same thing as sys_fsync()
+ */
+STATIC int
+xfs_dm_sync_by_handle(
+ struct inode *inode,
+ dm_right_t right)
+{
+ int err, ret;
+ xfs_inode_t *ip = XFS_I(inode);
+
+ /* Returns negative errors to DMAPI */
+ if (right < DM_RIGHT_EXCL)
+ return(-EACCES);
+
+ /* We need to protect against concurrent writers.. */
+ ret = filemap_fdatawrite(inode->i_mapping);
+ down_rw_sems(inode, DM_FLAGS_IMUX);
- err = -xfs_fsync(ip);
++ err = xfs_fsync(inode, 1);
+ if (!ret)
+ ret = err;
+ up_rw_sems(inode, DM_FLAGS_IMUX);
+ err = filemap_fdatawait(inode->i_mapping);
+ if (!ret)
+ ret = err;
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+ return ret;
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_upgrade_right(
+ struct inode *inode,
+ dm_right_t right,
+ u_int type) /* DM_FSYS_OBJ or zero */
+{
+#ifdef DEBUG_RIGHTS
+ char buffer[sizeof(dm_handle_t) * 2 + 1];
+
+ if (!xfs_vp_to_hexhandle(inode, type, buffer)) {
+ printf("dm_upgrade_right: old %d new %d type %d handle %s\n",
+ right, DM_RIGHT_EXCL, type, buffer);
+ } else {
+ printf("dm_upgrade_right: old %d new %d type %d handle "
+ "<INVALID>\n", right, DM_RIGHT_EXCL, type);
+ }
+#endif /* DEBUG_RIGHTS */
+ return(0);
+}
+
+
+STATIC int
+xfs_dm_write_invis_rvp(
+ struct inode *inode,
+ dm_right_t right,
+ int flags,
+ dm_off_t off,
+ dm_size_t len,
+ void __user *bufp,
+ int *rvp)
+{
+ int fflag = 0;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_EXCL)
+ return(-EACCES);
+
+ if (flags & DM_WRITE_SYNC)
+ fflag |= O_SYNC;
+ return(-xfs_dm_rdwr(inode, fflag, FMODE_WRITE, off, len, bufp, rvp));
+}
+
+
+STATIC void
+xfs_dm_obj_ref_hold(
+ struct inode *inode)
+{
+ igrab(inode);
+}
+
+
+static fsys_function_vector_t xfs_fsys_vector[DM_FSYS_MAX];
+
+
+STATIC int
+xfs_dm_get_dmapiops(
+ struct super_block *sb,
+ void *addr)
+{
+ static int initialized = 0;
+ dm_fcntl_vector_t *vecrq;
+ fsys_function_vector_t *vecp;
+ int i = 0;
+
+ vecrq = (dm_fcntl_vector_t *)addr;
+ vecrq->count =
+ sizeof(xfs_fsys_vector) / sizeof(xfs_fsys_vector[0]);
+ vecrq->vecp = xfs_fsys_vector;
+ if (initialized)
+ return(0);
+ vecrq->code_level = DM_CLVL_XOPEN;
+ vecp = xfs_fsys_vector;
+
+ vecp[i].func_no = DM_FSYS_CLEAR_INHERIT;
+ vecp[i++].u_fc.clear_inherit = xfs_dm_clear_inherit;
+ vecp[i].func_no = DM_FSYS_CREATE_BY_HANDLE;
+ vecp[i++].u_fc.create_by_handle = xfs_dm_create_by_handle;
+ vecp[i].func_no = DM_FSYS_DOWNGRADE_RIGHT;
+ vecp[i++].u_fc.downgrade_right = xfs_dm_downgrade_right;
+ vecp[i].func_no = DM_FSYS_GET_ALLOCINFO_RVP;
+ vecp[i++].u_fc.get_allocinfo_rvp = xfs_dm_get_allocinfo_rvp;
+ vecp[i].func_no = DM_FSYS_GET_BULKALL_RVP;
+ vecp[i++].u_fc.get_bulkall_rvp = xfs_dm_get_bulkall_rvp;
+ vecp[i].func_no = DM_FSYS_GET_BULKATTR_RVP;
+ vecp[i++].u_fc.get_bulkattr_rvp = xfs_dm_get_bulkattr_rvp;
+ vecp[i].func_no = DM_FSYS_GET_CONFIG;
+ vecp[i++].u_fc.get_config = xfs_dm_get_config;
+ vecp[i].func_no = DM_FSYS_GET_CONFIG_EVENTS;
+ vecp[i++].u_fc.get_config_events = xfs_dm_get_config_events;
+ vecp[i].func_no = DM_FSYS_GET_DESTROY_DMATTR;
+ vecp[i++].u_fc.get_destroy_dmattr = xfs_dm_get_destroy_dmattr;
+ vecp[i].func_no = DM_FSYS_GET_DIOINFO;
+ vecp[i++].u_fc.get_dioinfo = xfs_dm_get_dioinfo;
+ vecp[i].func_no = DM_FSYS_GET_DIRATTRS_RVP;
+ vecp[i++].u_fc.get_dirattrs_rvp = xfs_dm_get_dirattrs_rvp;
+ vecp[i].func_no = DM_FSYS_GET_DMATTR;
+ vecp[i++].u_fc.get_dmattr = xfs_dm_get_dmattr;
+ vecp[i].func_no = DM_FSYS_GET_EVENTLIST;
+ vecp[i++].u_fc.get_eventlist = xfs_dm_get_eventlist;
+ vecp[i].func_no = DM_FSYS_GET_FILEATTR;
+ vecp[i++].u_fc.get_fileattr = xfs_dm_get_fileattr;
+ vecp[i].func_no = DM_FSYS_GET_REGION;
+ vecp[i++].u_fc.get_region = xfs_dm_get_region;
+ vecp[i].func_no = DM_FSYS_GETALL_DMATTR;
+ vecp[i++].u_fc.getall_dmattr = xfs_dm_getall_dmattr;
+ vecp[i].func_no = DM_FSYS_GETALL_INHERIT;
+ vecp[i++].u_fc.getall_inherit = xfs_dm_getall_inherit;
+ vecp[i].func_no = DM_FSYS_INIT_ATTRLOC;
+ vecp[i++].u_fc.init_attrloc = xfs_dm_init_attrloc;
+ vecp[i].func_no = DM_FSYS_MKDIR_BY_HANDLE;
+ vecp[i++].u_fc.mkdir_by_handle = xfs_dm_mkdir_by_handle;
+ vecp[i].func_no = DM_FSYS_PROBE_HOLE;
+ vecp[i++].u_fc.probe_hole = xfs_dm_probe_hole;
+ vecp[i].func_no = DM_FSYS_PUNCH_HOLE;
+ vecp[i++].u_fc.punch_hole = xfs_dm_punch_hole;
+ vecp[i].func_no = DM_FSYS_READ_INVIS_RVP;
+ vecp[i++].u_fc.read_invis_rvp = xfs_dm_read_invis_rvp;
+ vecp[i].func_no = DM_FSYS_RELEASE_RIGHT;
+ vecp[i++].u_fc.release_right = xfs_dm_release_right;
+ vecp[i].func_no = DM_FSYS_REMOVE_DMATTR;
+ vecp[i++].u_fc.remove_dmattr = xfs_dm_remove_dmattr;
+ vecp[i].func_no = DM_FSYS_REQUEST_RIGHT;
+ vecp[i++].u_fc.request_right = xfs_dm_request_right;
+ vecp[i].func_no = DM_FSYS_SET_DMATTR;
+ vecp[i++].u_fc.set_dmattr = xfs_dm_set_dmattr;
+ vecp[i].func_no = DM_FSYS_SET_EVENTLIST;
+ vecp[i++].u_fc.set_eventlist = xfs_dm_set_eventlist;
+ vecp[i].func_no = DM_FSYS_SET_FILEATTR;
+ vecp[i++].u_fc.set_fileattr = xfs_dm_set_fileattr;
+ vecp[i].func_no = DM_FSYS_SET_INHERIT;
+ vecp[i++].u_fc.set_inherit = xfs_dm_set_inherit;
+ vecp[i].func_no = DM_FSYS_SET_REGION;
+ vecp[i++].u_fc.set_region = xfs_dm_set_region;
+ vecp[i].func_no = DM_FSYS_SYMLINK_BY_HANDLE;
+ vecp[i++].u_fc.symlink_by_handle = xfs_dm_symlink_by_handle;
+ vecp[i].func_no = DM_FSYS_SYNC_BY_HANDLE;
+ vecp[i++].u_fc.sync_by_handle = xfs_dm_sync_by_handle;
+ vecp[i].func_no = DM_FSYS_UPGRADE_RIGHT;
+ vecp[i++].u_fc.upgrade_right = xfs_dm_upgrade_right;
+ vecp[i].func_no = DM_FSYS_WRITE_INVIS_RVP;
+ vecp[i++].u_fc.write_invis_rvp = xfs_dm_write_invis_rvp;
+ vecp[i].func_no = DM_FSYS_OBJ_REF_HOLD;
+ vecp[i++].u_fc.obj_ref_hold = xfs_dm_obj_ref_hold;
+
+ return(0);
+}
+
+
+/* xfs_dm_send_mmap_event - send events needed for memory mapping a file.
+ *
+ * This is a workaround called for files that are about to be
+ * mapped. DMAPI events are not being generated at a low enough level
+ * in the kernel for page reads/writes to generate the correct events.
+ * So for memory-mapped files we generate read or write events for the
+ * whole byte range being mapped. If the mmap call can never cause a
+ * write to the file, then only a read event is sent.
+ *
+ * Code elsewhere prevents adding managed regions to a file while it
+ * is still mapped.
+ */
+
+STATIC int
+xfs_dm_send_mmap_event(
+ struct vm_area_struct *vma,
+ unsigned int wantflag)
+{
+ xfs_inode_t *ip;
+ int error = 0;
+ dm_eventtype_t max_event = DM_EVENT_READ;
+ xfs_fsize_t filesize;
+ xfs_off_t length, end_of_area, evsize, offset;
+ int iolock;
+
+ if (!vma->vm_file)
+ return 0;
+
+ ip = XFS_I(vma->vm_file->f_dentry->d_inode);
+
+ if (!S_ISREG(vma->vm_file->f_dentry->d_inode->i_mode) ||
+ !(ip->i_mount->m_flags & XFS_MOUNT_DMAPI))
+ return 0;
+
+ /* If they specifically asked for 'read', then give it to them.
+ * Otherwise, see if it's possible to give them 'write'.
+ */
+ if( wantflag & VM_READ ){
+ max_event = DM_EVENT_READ;
+ }
+ else if( ! (vma->vm_flags & VM_DENYWRITE) ) {
+ if((wantflag & VM_WRITE) || (vma->vm_flags & VM_WRITE))
+ max_event = DM_EVENT_WRITE;
+ }
+
+ if( (wantflag & VM_WRITE) && (max_event != DM_EVENT_WRITE) ){
+ return -EACCES;
+ }
+
+ /* Figure out how much of the file is being requested by the user. */
+ offset = 0; /* beginning of file, for now */
+ length = 0; /* whole file, for now */
+
+ filesize = ip->i_new_size;
+ if (filesize < ip->i_size) {
+ filesize = ip->i_size;
+ }
+
+ /* Set first byte number beyond the map area. */
+
+ if (length) {
+ end_of_area = offset + length;
+ if (end_of_area > filesize)
+ end_of_area = filesize;
+ } else {
+ end_of_area = filesize;
+ }
+
+ /* Set the real amount being mapped. */
+ evsize = end_of_area - offset;
+ if (evsize < 0)
+ evsize = 0;
+
+ if (max_event == DM_EVENT_READ)
+ iolock = XFS_IOLOCK_SHARED;
+ else
+ iolock = XFS_IOLOCK_EXCL;
+
+ xfs_ilock(ip, iolock);
+ /* If write possible, try a DMAPI write event */
+ if (max_event == DM_EVENT_WRITE && DM_EVENT_ENABLED(ip, max_event)) {
+ error = xfs_dm_send_data_event(max_event, ip, offset,
+ evsize, 0, &iolock);
+ goto out_unlock;
+ }
+
+ /* Try a read event if max_event was != DM_EVENT_WRITE or if it
+ * was DM_EVENT_WRITE but the WRITE event was not enabled.
+ */
+ if (DM_EVENT_ENABLED(ip, DM_EVENT_READ)) {
+ error = xfs_dm_send_data_event(DM_EVENT_READ, ip, offset,
+ evsize, 0, &iolock);
+ }
+out_unlock:
+ xfs_iunlock(ip, iolock);
+ return -error;
+}
+
+
+STATIC int
+xfs_dm_send_destroy_event(
+ xfs_inode_t *ip,
+ dm_right_t vp_right) /* always DM_RIGHT_NULL */
+{
+ /* Returns positive errors to XFS */
+ return -dm_send_destroy_event(&ip->i_vnode, vp_right);
+}
+
+
+STATIC int
+xfs_dm_send_namesp_event(
+ dm_eventtype_t event,
+ struct xfs_mount *mp,
+ xfs_inode_t *ip1,
+ dm_right_t vp1_right,
+ xfs_inode_t *ip2,
+ dm_right_t vp2_right,
+ const char *name1,
+ const char *name2,
+ mode_t mode,
+ int retcode,
+ int flags)
+{
+ /* Returns positive errors to XFS */
+ return -dm_send_namesp_event(event, mp ? mp->m_super : NULL,
+ &ip1->i_vnode, vp1_right,
+ ip2 ? &ip2->i_vnode : NULL, vp2_right,
+ name1, name2,
+ mode, retcode, flags);
+}
+
+STATIC int
+xfs_dm_send_mount_event(
+ struct xfs_mount *mp,
+ dm_right_t root_right,
+ char *mtpt,
+ char *fsname)
+{
+ return dm_send_mount_event(mp->m_super, root_right,
+ NULL, DM_RIGHT_NULL,
+ mp->m_rootip ? VFS_I(mp->m_rootip) : NULL,
+ DM_RIGHT_NULL, mtpt, fsname);
+}
+
+STATIC void
+xfs_dm_send_unmount_event(
+ struct xfs_mount *mp,
+ xfs_inode_t *ip, /* NULL if unmount successful */
+ dm_right_t vfsp_right,
+ mode_t mode,
+ int retcode, /* errno, if unmount failed */
+ int flags)
+{
+ dm_send_unmount_event(mp->m_super, ip ? &ip->i_vnode : NULL,
+ vfsp_right, mode, retcode, flags);
+}
+
+
+/*
+ * Data migration operations accessed by the rest of XFS.
+ * When DMAPI support is configured in, this vector is used.
+ */
+
+xfs_dmops_t xfs_dmcore_xfs = {
+ .xfs_send_data = xfs_dm_send_data_event,
+ .xfs_send_mmap = xfs_dm_send_mmap_event,
+ .xfs_send_destroy = xfs_dm_send_destroy_event,
+ .xfs_send_namesp = xfs_dm_send_namesp_event,
+ .xfs_send_mount = xfs_dm_send_mount_event,
+ .xfs_send_unmount = xfs_dm_send_unmount_event,
+};
+EXPORT_SYMBOL(xfs_dmcore_xfs);
+
+STATIC int
+xfs_dm_fh_to_inode(
+ struct super_block *sb,
+ struct inode **inode,
+ dm_fid_t *dmfid)
+{
+ xfs_mount_t *mp = XFS_M(sb);
+ xfs_inode_t *ip;
+ xfs_ino_t ino;
+ unsigned int igen;
+ int error;
+
+ *inode = NULL;
+
+ if (!dmfid->dm_fid_len) {
+ /* filesystem handle */
+ *inode = igrab(&mp->m_rootip->i_vnode);
+ if (!*inode)
+ return -ENOENT;
+ return 0;
+ }
+
+ if (dmfid->dm_fid_len != sizeof(*dmfid) - sizeof(dmfid->dm_fid_len))
+ return -EINVAL;
+
+ ino = dmfid->dm_fid_ino;
+ igen = dmfid->dm_fid_gen;
+
+ /* fail requests for ino 0 gracefully. */
+ if (ino == 0)
+ return -ESTALE;
+
+ error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
+ if (error)
+ return -error;
+ if (!ip)
+ return -EIO;
+
+ if (!ip->i_d.di_mode || ip->i_d.di_gen != igen) {
+ xfs_iput_new(ip, XFS_ILOCK_SHARED);
+ return -ENOENT;
+ }
+
+ *inode = &ip->i_vnode;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return 0;
+}
+
+STATIC int
+xfs_dm_inode_to_fh(
+ struct inode *inode,
+ dm_fid_t *dmfid,
+ dm_fsid_t *dmfsid)
+{
+ xfs_inode_t *ip = XFS_I(inode);
+
+ /* Returns negative errors to DMAPI */
+
+ if (ip->i_mount->m_fixedfsid == NULL)
+ return -EINVAL;
+
+ dmfid->dm_fid_len = sizeof(dm_fid_t) - sizeof(dmfid->dm_fid_len);
+ dmfid->dm_fid_pad = 0;
+ /*
+ * use memcpy because the inode is a long long and there's no
+ * assurance that dmfid->dm_fid_ino is properly aligned.
+ */
+ memcpy(&dmfid->dm_fid_ino, &ip->i_ino, sizeof(dmfid->dm_fid_ino));
+ dmfid->dm_fid_gen = ip->i_d.di_gen;
+
+ memcpy(dmfsid, ip->i_mount->m_fixedfsid, sizeof(*dmfsid));
+ return 0;
+}
+
+STATIC void
+xfs_dm_get_fsid(
+ struct super_block *sb,
+ dm_fsid_t *fsid)
+{
+ memcpy(fsid, XFS_M(sb)->m_fixedfsid, sizeof(*fsid));
+}
+
+/*
+ * Filesystem operations accessed by the DMAPI core.
+ */
+static struct filesystem_dmapi_operations xfs_dmapiops = {
+ .get_fsys_vector = xfs_dm_get_dmapiops,
+ .fh_to_inode = xfs_dm_fh_to_inode,
+ .inode_to_fh = xfs_dm_inode_to_fh,
+ .get_fsid = xfs_dm_get_fsid,
+};
+
+static int __init
+xfs_dm_init(void)
+{
+ printk(KERN_INFO "SGI XFS Data Management API subsystem\n");
+
+ dmapi_register(&xfs_fs_type, &xfs_dmapiops);
+ return 0;
+}
+
+static void __exit
+xfs_dm_exit(void)
+{
+ dmapi_unregister(&xfs_fs_type);
+}
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION("SGI XFS dmapi subsystem");
+MODULE_LICENSE("GPL");
+
+module_init(xfs_dm_init);
+module_exit(xfs_dm_exit);
#include <linux/dcache.h>
static const struct vm_operations_struct xfs_file_vm_ops;
+#ifdef HAVE_DMAPI
+static struct vm_operations_struct xfs_dmapi_file_vm_ops;
+#endif
- STATIC ssize_t
- xfs_file_aio_read(
- struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
+ /*
+ * xfs_iozero
+ *
+ * xfs_iozero clears the specified range of buffer supplied,
+ * and marks all the affected blocks as valid and modified. If
+ * an affected block is not allocated, it will be allocated. If
+ * an affected block is not completely overwritten, and is not
+ * valid before the operation, it will be read from disk before
+ * being partially zeroed.
+ */
+ STATIC int
+ xfs_iozero(
+ struct xfs_inode *ip, /* inode */
+ loff_t pos, /* offset in file */
+ size_t count) /* size of data to zero */
{
- struct file *file = iocb->ki_filp;
- int ioflags = 0;
+ struct page *page;
+ struct address_space *mapping;
+ int status;
- BUG_ON(iocb->ki_pos != pos);
- if (unlikely(file->f_flags & O_DIRECT))
- ioflags |= IO_ISDIRECT;
- if (file->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
- return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov,
- nr_segs, &iocb->ki_pos, ioflags);
+ mapping = VFS_I(ip)->i_mapping;
+ do {
+ unsigned offset, bytes;
+ void *fsdata;
+
+ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+ bytes = PAGE_CACHE_SIZE - offset;
+ if (bytes > count)
+ bytes = count;
+
+ status = pagecache_write_begin(NULL, mapping, pos, bytes,
+ AOP_FLAG_UNINTERRUPTIBLE,
+ &page, &fsdata);
+ if (status)
+ break;
+
+ zero_user(page, offset, bytes);
+
+ status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
+ page, fsdata);
+ WARN_ON(status <= 0); /* can't return less than zero! */
+ pos += bytes;
+ count -= bytes;
+ status = 0;
+ } while (count);
+
+ return (-status);
}
-STATIC int
-xfs_file_fsync(
- struct file *file,
- struct dentry *dentry,
- int datasync)
++int
++xfs_fsync(struct inode *inode, int datasync)
+ {
- struct xfs_inode *ip = XFS_I(dentry->d_inode);
++ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_trans *tp;
+ int error = 0;
+ int log_flushed = 0;
+
+ xfs_itrace_entry(ip);
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -XFS_ERROR(EIO);
+
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+
+ /*
+ * We always need to make sure that the required inode state is safe on
+ * disk. The inode might be clean but we still might need to force the
+ * log because of committed transactions that haven't hit the disk yet.
+ * Likewise, there could be unflushed non-transactional changes to the
+ * inode core that have to go to disk and this requires us to issue
+ * a synchronous transaction to capture these changes correctly.
+ *
+ * This code relies on the assumption that if the i_update_core field
+ * of the inode is clear and the inode is unpinned then it is clean
+ * and no action is required.
+ */
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+ /*
+ * First check if the VFS inode is marked dirty. All the dirtying
+	 * of non-transactional updates now goes through mark_inode_dirty*,
+	 * which allows us to distinguish between pure timestamp updates
+	 * and i_size updates which need to be caught for fdatasync.
+	 * After that also check for the dirty state in the XFS inode, which
+	 * might get cleared when the inode gets written out via the AIL
+ * or xfs_iflush_cluster.
+ */
- if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) ||
- ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
++ if (((inode->i_state & I_DIRTY_DATASYNC) ||
++ ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
+ ip->i_update_core) {
+ /*
+ * Kick off a transaction to log the inode core to get the
+ * updates. The sync transaction will also force the log.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
+ error = xfs_trans_reserve(tp, 0,
+ XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return -error;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ /*
+ * Note - it's possible that we might have pushed ourselves out
+ * of the way during trans_reserve which would flush the inode.
+ * But there's no guarantee that the inode buffer has actually
+ * gone out yet (it's delwri). Plus the buffer could be pinned
+ * anyway if it's part of an inode in another recent
+ * transaction. So we play it safe and fire off the
+ * transaction anyway.
+ */
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ihold(tp, ip);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ xfs_trans_set_sync(tp);
+ error = _xfs_trans_commit(tp, 0, &log_flushed);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ } else {
+ /*
+ * Timestamps/size haven't changed since last inode flush or
+ * inode transaction commit. That means either nothing got
+ * written or a transaction committed which caught the updates.
+ * If the latter happened and the transaction hasn't hit the
+ * disk yet, the inode will be still be pinned. If it is,
+ * force the log.
+ */
+ if (xfs_ipincount(ip)) {
+ error = _xfs_log_force_lsn(ip->i_mount,
+ ip->i_itemp->ili_last_lsn,
+ XFS_LOG_SYNC, &log_flushed);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ }
+
+ if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
+ /*
+ * If the log write didn't issue an ordered tag we need
+ * to flush the disk cache for the data device now.
+ */
+ if (!log_flushed)
+ xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
+
+ /*
+ * If this inode is on the RT dev we need to flush that
+ * cache as well.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
+ }
+
+ return -error;
+ }
+
++STATIC int
++xfs_file_fsync(
++ struct file *file,
++ struct dentry *dentry,
++ int datasync)
++{
++ return xfs_fsync(dentry->d_inode, datasync);
++}
++
++
STATIC ssize_t
- xfs_file_aio_write(
+ xfs_file_aio_read(
struct kiocb *iocb,
- const struct iovec *iov,
+ const struct iovec *iovp,
unsigned long nr_segs,
loff_t pos)
{
return -xfs_release(XFS_I(inode));
}
- /*
- * We ignore the datasync flag here because a datasync is effectively
- * identical to an fsync. That is, datasync implies that we need to write
- * only the metadata needed to be able to access the data that is written
- * if we crash after the call completes. Hence if we are writing beyond
- * EOF we have to log the inode size change as well, which makes it a
- * full fsync. If we don't write beyond EOF, the inode core will be
- * clean in memory and so we don't need to log the inode, just like
- * fsync.
- */
- STATIC int
- xfs_file_fsync(
- struct file *file,
- struct dentry *dentry,
- int datasync)
- {
- struct xfs_inode *ip = XFS_I(dentry->d_inode);
-
- xfs_iflags_clear(ip, XFS_ITRUNCATED);
- return -xfs_fsync(ip);
- }
-
+#ifdef HAVE_DMAPI
+STATIC int
+xfs_vm_fault(
+ struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct xfs_mount *mp = XFS_M(inode->i_sb);
+
+ ASSERT_ALWAYS(mp->m_flags & XFS_MOUNT_DMAPI);
+
+ if (XFS_SEND_MMAP(mp, vma, 0))
+ return VM_FAULT_SIGBUS;
+ return filemap_fault(vma, vmf);
+}
+#endif /* HAVE_DMAPI */
+
STATIC int
xfs_file_readdir(
struct file *filp,
extern void xfs_setup_inode(struct xfs_inode *);
++extern int xfs_fsync(struct inode *, int);
++
#endif /* __XFS_IOPS_H__ */
++
}
/*
- * xfs_fsync
- *
- * This is called to sync the inode and its data out to disk. We need to hold
- * the I/O lock while flushing the data, and the inode lock while flushing the
- * inode. The inode lock CANNOT be held while flushing the data, so acquire
- * after we're done with that.
- */
- int
- xfs_fsync(
- xfs_inode_t *ip)
- {
- xfs_trans_t *tp;
- int error = 0;
- int log_flushed = 0, changed = 1;
-
- xfs_itrace_entry(ip);
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return XFS_ERROR(EIO);
-
- /*
- * We always need to make sure that the required inode state is safe on
- * disk. The inode might be clean but we still might need to force the
- * log because of committed transactions that haven't hit the disk yet.
- * Likewise, there could be unflushed non-transactional changes to the
- * inode core that have to go to disk and this requires us to issue
- * a synchronous transaction to capture these changes correctly.
- *
- * This code relies on the assumption that if the update_* fields
- * of the inode are clear and the inode is unpinned then it is clean
- * and no action is required.
- */
- xfs_ilock(ip, XFS_ILOCK_SHARED);
-
- if (!ip->i_update_core) {
- /*
- * Timestamps/size haven't changed since last inode flush or
- * inode transaction commit. That means either nothing got
- * written or a transaction committed which caught the updates.
- * If the latter happened and the transaction hasn't hit the
- * disk yet, the inode will be still be pinned. If it is,
- * force the log.
- */
-
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (xfs_ipincount(ip)) {
- error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
- XFS_LOG_FORCE | XFS_LOG_SYNC,
- &log_flushed);
- } else {
- /*
- * If the inode is not pinned and nothing has changed
- * we don't need to flush the cache.
- */
- changed = 0;
- }
- } else {
- /*
- * Kick off a transaction to log the inode core to get the
- * updates. The sync transaction will also force the log.
- */
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, 0,
- XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return error;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- /*
- * Note - it's possible that we might have pushed ourselves out
- * of the way during trans_reserve which would flush the inode.
- * But there's no guarantee that the inode buffer has actually
- * gone out yet (it's delwri). Plus the buffer could be pinned
- * anyway if it's part of an inode in another recent
- * transaction. So we play it safe and fire off the
- * transaction anyway.
- */
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_trans_set_sync(tp);
- error = _xfs_trans_commit(tp, 0, &log_flushed);
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
-
- if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
- /*
- * If the log write didn't issue an ordered tag we need
- * to flush the disk cache for the data device now.
- */
- if (!log_flushed)
- xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
-
- /*
- * If this inode is on the RT dev we need to flush that
- * cache as well.
- */
- if (XFS_IS_REALTIME_INODE(ip))
- xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
- }
-
- return error;
- }
-
- /*
- * Flags for xfs_free_eofblocks
- */
-#define XFS_FREE_EOF_TRYLOCK (1<<0)
-
-/*
* This is called by xfs_inactive to free any blocks beyond eof
* when the link count isn't zero and by xfs_dm_punch_hole() when
* punching a hole to EOF.
#define MAX_PHYS_SEGMENTS 128
#define MAX_HW_SEGMENTS 128
#define SAFE_MAX_SECTORS 255
+ #define MAX_SEGMENT_SIZE 65536
+
+ enum blk_default_limits {
+ BLK_MAX_SEGMENTS = 128,
+ BLK_SAFE_MAX_SECTORS = 255,
+#ifndef CONFIG_KERNEL_DESKTOP
- #define BLK_DEF_MAX_SECTORS 2048
++ BLK_DEF_MAX_SECTORS = 2048,
+#else
- #define BLK_DEF_MAX_SECTORS 1024
+ BLK_DEF_MAX_SECTORS = 1024,
+#endif
-
- #define MAX_SEGMENT_SIZE 65536
-
- #define BLK_SEG_BOUNDARY_MASK 0xFFFFFFFFUL
+ BLK_MAX_SEGMENT_SIZE = 65536,
+ BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL,
+ };
#define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
+#ifdef CONFIG_NETVM
+ __u8 emergency:1;
+#endif
- #ifdef CONFIG_XEN
- __u8 proto_data_valid:1,
- proto_csum_blank:1;
- #endif
kmemcheck_bitfield_end(flags2);
- /* 0/9...15 bit hole */
+ /* 0/14 bit hole */
#ifdef CONFIG_NET_DMA
dma_cookie_t dma_cookie;
void **freelist; /* Pointer to first free per cpu object */
struct page *page; /* The slab from which we are allocating */
int node; /* The node of the page (or -1 for debug) */
- unsigned int offset; /* Freepointer offset (in word units) */
- unsigned int objsize; /* Size of an object (from kmem_cache) */
+ int reserve; /* Did the current page come from the reserve */
#ifdef CONFIG_SLUB_STATS
unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
config HAVE_UNSTABLE_SCHED_CLOCK
bool
- config GROUP_SCHED
- bool "Group CPU scheduler"
- depends on EXPERIMENTAL
- default n if KERNEL_DESKTOP
- default y
- help
- This feature lets CPU scheduler recognize task groups and control CPU
- bandwidth allocation to such task groups.
- In order to create a group from arbitrary set of processes, use
- CONFIG_CGROUPS. (See Control Group support.)
-
- config FAIR_GROUP_SCHED
- bool "Group scheduling for SCHED_OTHER"
- depends on GROUP_SCHED
- default GROUP_SCHED
-
- config RT_GROUP_SCHED
- bool "Group scheduling for SCHED_RR/FIFO"
- depends on EXPERIMENTAL
- depends on GROUP_SCHED
- default n
- help
- This feature lets you explicitly allocate real CPU bandwidth
- to users or control groups (depending on the "Basis for grouping tasks"
- setting below. If enabled, it will also make it impossible to
- schedule realtime tasks for non-root users until you allocate
- realtime bandwidth for them.
- See Documentation/scheduler/sched-rt-group.txt for more information.
-
- choice
- depends on GROUP_SCHED
- prompt "Basis for grouping tasks"
- default USER_SCHED
-
- config USER_SCHED
- bool "user id"
- help
- This option will choose userid as the basis for grouping
- tasks, thus providing equal CPU bandwidth to each user.
-
- config CGROUP_SCHED
- bool "Control groups"
- depends on CGROUPS
- help
- This option allows you to create arbitrary task groups
- using the "cgroup" pseudo filesystem and control
- the cpu bandwidth allocated to each such task group.
- Refer to Documentation/cgroups/cgroups.txt for more
- information on "cgroup" pseudo filesystem.
-
- endchoice
-
menuconfig CGROUPS
boolean "Control Group support"
+ default n if KERNEL_DESKTOP
+ default y
help
This option adds support for grouping sets of processes together, for
use with process control subsystems such as Cpusets, CFS, memory
Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
size is 4096bytes, 512k per 1Gbytes of swap.
+ menuconfig CGROUP_SCHED
+ bool "Group CPU scheduler"
+ depends on EXPERIMENTAL && CGROUPS
- default n
++ default n if KERNEL_DESKTOP
++ default y
+ help
+ This feature lets CPU scheduler recognize task groups and control CPU
+ bandwidth allocation to such task groups. It uses cgroups to group
+ tasks.
+
+ if CGROUP_SCHED
+ config FAIR_GROUP_SCHED
+ bool "Group scheduling for SCHED_OTHER"
+ depends on CGROUP_SCHED
+ default CGROUP_SCHED
+
+ config RT_GROUP_SCHED
+ bool "Group scheduling for SCHED_RR/FIFO"
+ depends on EXPERIMENTAL
+ depends on CGROUP_SCHED
+ default n
+ help
+ This feature lets you explicitly allocate real CPU bandwidth
+ to users or control groups (depending on the "Basis for grouping tasks"
+	  setting below). If enabled, it will also make it impossible to
+ schedule realtime tasks for non-root users until you allocate
+ realtime bandwidth for them.
+ See Documentation/scheduler/sched-rt-group.txt for more information.
+
+ endif #CGROUP_SCHED
+
endif # CGROUPS
config MM_OWNER
See Documentation/nommu-mmap.txt for more information.
+config DEFAULT_VM_DIRTY_RATIO
+ int "Default VM dirty ratio (in %)"
+ default 20 if KERNEL_DESKTOP
+ default 40
+ help
+	  Allows the VM dirty ratio to be tuned to suit different workloads.
+	  An increased VM dirty ratio improves performance of most server
+	  workloads that dirty a lot of memory (e.g. simple databases not
+	  using direct IO, workloads doing heavy writes). Latency-sensitive
+	  workloads such as desktops and typical workstations perform better
+	  with a decreased VM dirty ratio.
+
+ Recommended value for desktop workload is 20.
+ Recommended value for server workload is 40.
+
+ Only use this if you really know what you are doing.
+
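
The Kconfig default above only selects the compiled-in value; the ratio can still be changed at runtime through the usual procfs knob. A minimal user-space sketch (illustrative only) that applies the desktop-oriented value of 20:

	#include <stdio.h>

	int main(void)
	{
		/* /proc/sys/vm/dirty_ratio is the runtime view of vm_dirty_ratio */
		FILE *f = fopen("/proc/sys/vm/dirty_ratio", "w");

		if (!f) {
			perror("dirty_ratio");
			return 1;
		}
		fprintf(f, "%d\n", 20);	/* e.g. the latency-sensitive default */
		fclose(f);
		return 0;
	}
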
config PROFILING
- bool "Profiling support (EXPERIMENTAL)"
+ bool "Profiling support"
help
Say Y here to enable the extended profiling support mechanisms used
by profilers such as OProfile.
spin_lock(&mq_lock);
if (u->mq_bytes + mq_bytes < u->mq_bytes ||
u->mq_bytes + mq_bytes >
- p->signal->rlim[RLIMIT_MSGQUEUE].rlim_cur) {
+ task_rlimit(p, RLIMIT_MSGQUEUE)) {
spin_unlock(&mq_lock);
+ kfree(info->messages);
goto out_inode;
}
u->mq_bytes += mq_bytes;
--- /dev/null
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/types.h>
+#include <linux/kdb.h>
+#include <linux/kdbprivate.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_host.h>
+#include <asm/pgtable.h>
+
+MODULE_AUTHOR("SGI");
+MODULE_DESCRIPTION("Debug VM information");
+MODULE_LICENSE("GPL");
+
+struct __vmflags {
+ unsigned long mask;
+ char *name;
+};
+
+static struct __vmflags vmflags[] = {
+ { VM_READ, "VM_READ " },
+ { VM_WRITE, "VM_WRITE " },
+ { VM_EXEC, "VM_EXEC " },
+ { VM_SHARED, "VM_SHARED " },
+ { VM_MAYREAD, "VM_MAYREAD " },
+ { VM_MAYWRITE, "VM_MAYWRITE " },
+ { VM_MAYEXEC, "VM_MAYEXEC " },
+ { VM_MAYSHARE, "VM_MAYSHARE " },
+ { VM_GROWSDOWN, "VM_GROWSDOWN " },
+ { VM_GROWSUP, "VM_GROWSUP " },
+ { VM_PFNMAP, "VM_PFNMAP " },
+ { VM_DENYWRITE, "VM_DENYWRITE " },
+ { VM_EXECUTABLE, "VM_EXECUTABLE " },
+ { VM_LOCKED, "VM_LOCKED " },
+ { VM_IO, "VM_IO " },
+ { VM_SEQ_READ, "VM_SEQ_READ " },
+ { VM_RAND_READ, "VM_RAND_READ " },
+ { VM_DONTCOPY, "VM_DONTCOPY " },
+ { VM_DONTEXPAND, "VM_DONTEXPAND " },
+ { VM_RESERVED, "VM_RESERVED " },
+ { VM_ACCOUNT, "VM_ACCOUNT " },
+ { VM_HUGETLB, "VM_HUGETLB " },
+ { VM_NONLINEAR, "VM_NONLINEAR " },
+ { VM_MAPPED_COPY, "VM_MAPPED_COPY " },
+ { VM_INSERTPAGE, "VM_INSERTPAGE " },
+ { 0, "" }
+};
+
+static int
+kdbm_print_vm(struct vm_area_struct *vp, unsigned long addr, int verbose_flg)
+{
+ struct __vmflags *tp;
+
+ kdb_printf("struct vm_area_struct at 0x%lx for %d bytes\n",
+ addr, (int) sizeof (struct vm_area_struct));
+
+ kdb_printf("vm_start = 0x%p vm_end = 0x%p\n", (void *) vp->vm_start,
+ (void *) vp->vm_end);
+ kdb_printf("vm_page_prot = 0x%llx\n",
+ (unsigned long long)pgprot_val(vp->vm_page_prot));
+
+ kdb_printf("vm_flags: ");
+ for (tp = vmflags; tp->mask; tp++) {
+ if (vp->vm_flags & tp->mask) {
+ kdb_printf(" %s", tp->name);
+ }
+ }
+ kdb_printf("\n");
+
+ if (!verbose_flg)
+ return 0;
+
+ kdb_printf("vm_mm = 0x%p\n", (void *) vp->vm_mm);
+ kdb_printf("vm_next = 0x%p\n", (void *) vp->vm_next);
+ kdb_printf("shared.vm_set.list.next = 0x%p\n", (void *) vp->shared.vm_set.list.next);
+ kdb_printf("shared.vm_set.list.prev = 0x%p\n", (void *) vp->shared.vm_set.list.prev);
+ kdb_printf("shared.vm_set.parent = 0x%p\n", (void *) vp->shared.vm_set.parent);
+ kdb_printf("shared.vm_set.head = 0x%p\n", (void *) vp->shared.vm_set.head);
- kdb_printf("anon_vma_node.next = 0x%p\n", (void *) vp->anon_vma_node.next);
- kdb_printf("anon_vma_node.prev = 0x%p\n", (void *) vp->anon_vma_node.prev);
++ kdb_printf("anon_vma_chain.next = 0x%p\n", (void *) vp->anon_vma_chain.next);
++ kdb_printf("anon_vma_chain.prev = 0x%p\n", (void *) vp->anon_vma_chain.prev);
+ kdb_printf("vm_ops = 0x%p\n", (void *) vp->vm_ops);
+ if (vp->vm_ops != NULL) {
+ kdb_printf("vm_ops->open = 0x%p\n", vp->vm_ops->open);
+ kdb_printf("vm_ops->close = 0x%p\n", vp->vm_ops->close);
+ kdb_printf("vm_ops->fault = 0x%p\n", vp->vm_ops->fault);
+#ifdef HAVE_VMOP_MPROTECT
+ kdb_printf("vm_ops->mprotect = 0x%p\n", vp->vm_ops->mprotect);
+#endif
+#ifdef CONFIG_NUMA
+ kdb_printf("vm_ops->set_policy = 0x%p\n", vp->vm_ops->set_policy);
+ kdb_printf("vm_ops->get_policy = 0x%p\n", vp->vm_ops->get_policy);
+#endif
+ }
+ kdb_printf("vm_pgoff = 0x%lx\n", vp->vm_pgoff);
+ kdb_printf("vm_file = 0x%p\n", (void *) vp->vm_file);
+ kdb_printf("vm_private_data = 0x%p\n", vp->vm_private_data);
+#ifdef CONFIG_NUMA
+ kdb_printf("vm_policy = 0x%p\n", vp->vm_policy);
+#endif
+
+ return 0;
+}
+
+static int
+kdbm_print_vmp(struct vm_area_struct *vp, int verbose_flg)
+{
+ struct __vmflags *tp;
+
+ if (verbose_flg) {
+ kdb_printf("0x%lx: ", (unsigned long) vp);
+ }
+
+ kdb_printf("0x%p 0x%p ", (void *) vp->vm_start, (void *) vp->vm_end);
+
+ for (tp = vmflags; tp->mask; tp++) {
+ if (vp->vm_flags & tp->mask) {
+ kdb_printf(" %s", tp->name);
+ }
+ }
+ kdb_printf("\n");
+
+ return 0;
+}
+
+
+#ifdef CONFIG_NUMA
+#include <linux/mempolicy.h>
+
+/*
+ * kdbm_mpol
+ *
+ * This function implements the 'mempolicy' command.
+ * Print a struct mempolicy.
+ *
+ * mempolicy <address> Print struct mempolicy at <address>
+ */
+static int
+kdbm_mpol(int argc, const char **argv)
+{
+ unsigned long addr;
+ long offset = 0;
+ int nextarg;
+ int err = 0;
+ struct mempolicy *mp = NULL;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((err = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset,
+ NULL)) != 0)
+ return(err);
+
+ if (!(mp = kmalloc(sizeof(*mp), GFP_ATOMIC))) {
+ kdb_printf("%s: cannot kmalloc mp\n", __FUNCTION__);
+ goto out;
+ }
+
+ if ((err = kdb_getarea(*mp, addr))) {
+ kdb_printf("%s: invalid mempolicy address\n", __FUNCTION__);
+ goto out;
+ }
+
+ kdb_printf("struct mempolicy at 0x%p\n", (struct mempolicy *)addr);
+ kdb_printf(" refcnt %d\n", atomic_read(&mp->refcnt));
+
+ switch (mp->mode) {
+ case MPOL_DEFAULT:
+ kdb_printf(" mode %d (MPOL_DEFAULT)\n", mp->mode);
+ break;
+
+ case MPOL_PREFERRED:
+ kdb_printf(" mode %d (MPOL_PREFERRED)\n", mp->mode);
+ if (mp->flags & MPOL_F_LOCAL)
+ kdb_printf(" preferred_node local\n");
+ else
+ kdb_printf(" preferred_node %d\n", mp->v.preferred_node);
+ break;
+
+ case MPOL_BIND:
+ case MPOL_INTERLEAVE:
+ {
+ int i, nlongs;
+ unsigned long *longp;
+
+ kdb_printf(" mode %d (%s)\n", mp->mode,
+ mp->mode == MPOL_INTERLEAVE
+ ? "MPOL_INTERLEAVE"
+ : "MPOL_BIND");
+ nlongs = (int)BITS_TO_LONGS(MAX_NUMNODES);
+ kdb_printf(" nodes:");
+ longp = mp->v.nodes.bits;
+ for (i = 0; i < nlongs; i++, longp++)
+ kdb_printf(" 0x%lx ", *longp);
+ kdb_printf("\n");
+ break;
+ }
+
+ default:
+ kdb_printf(" mode %d (unknown)\n", mp->mode);
+ break;
+ }
+out:
+ if (mp)
+ kfree(mp);
+ return err;
+}
+
+#endif /* CONFIG_NUMA */
+
+/*
+ * kdbm_pgdat
+ *
+ * This function implements the 'pgdat' command.
+ * Print a struct pglist_data (pg_data_t).
+ *
+ * pgdat <node_id> Print struct pglist_data for node <node_id>.
+ *
+ * Print pglist_data for node 0 if node_id not specified,
+ * or print the one pglist_data structure if !CONFIG_NUMA.
+ */
+static int
+kdbm_pgdat(int argc, const char **argv)
+{
+ int err = 0, node_id = 0, i;
+ pg_data_t *pgdatp = NULL;
+
+#ifdef CONFIG_NUMA
+ if (argc > 1)
+ return KDB_ARGCOUNT;
+ if (argc == 1) {
+ int nextarg;
+ long offset = 0;
+ unsigned long node_id_ul;
+
+ nextarg = 1;
+ if ((err = kdbgetaddrarg(argc, argv, &nextarg, &node_id_ul,
+ &offset, NULL)) != 0) {
+ return(err);
+ }
+ node_id = (int)node_id_ul;
+ }
+#endif
+ for_each_online_pgdat(pgdatp) {
+ if (pgdatp->node_id == node_id)
+ break;
+ }
+ if (!pgdatp) {
+ kdb_printf("%s: specified node not found\n", __FUNCTION__);
+ return 0;
+ }
+ kdb_printf("struct pglist_data at 0x%p node_id = %d\n",
+ pgdatp, pgdatp->node_id);
+
+ for (i = 0; i < MAX_ZONELISTS; i++) {
+ int zr;
+ struct zoneref *zonerefp;
+ struct zone *zonep;
+
+ zonerefp = pgdatp->node_zonelists[i]._zonerefs;
+ kdb_printf(" _zonerefs[%d] at 0x%p\n", i, zonerefp);
+
+ for (zr = 0; zr <= MAX_ZONES_PER_ZONELIST; zr++, zonerefp++) {
+ int z;
+ pg_data_t *tmp_pgdatp;
+
+ zonep = zonelist_zone(zonerefp);
+ if (!zonep)
+ break;
+
+ kdb_printf(" 0x%p", zonep);
+
+ for_each_online_pgdat(tmp_pgdatp) {
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ if (zonep == &tmp_pgdatp->node_zones[z]) {
+ kdb_printf (" (node %d node_zones[%d])",
+ tmp_pgdatp->node_id, z);
+ break;
+ }
+ }
+ if (z != MAX_NR_ZONES)
+ break; /* found it */
+ }
+ kdb_printf("\n");
+ }
+ }
+
+ kdb_printf(" nr_zones = %d", pgdatp->nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ kdb_printf(" node_mem_map = 0x%p\n", pgdatp->node_mem_map);
+#endif
++#ifndef CONFIG_NO_BOOTMEM
+ kdb_printf(" bdata = 0x%p", pgdatp->bdata);
++#endif
+ kdb_printf(" node_start_pfn = 0x%lx\n", pgdatp->node_start_pfn);
+ kdb_printf(" node_present_pages = %ld (0x%lx)\n",
+ pgdatp->node_present_pages, pgdatp->node_present_pages);
+ kdb_printf(" node_spanned_pages = %ld (0x%lx)\n",
+ pgdatp->node_spanned_pages, pgdatp->node_spanned_pages);
+ kdb_printf(" kswapd = 0x%p\n", pgdatp->kswapd);
+
+ return err;
+}
+
+/*
+ * kdbm_vm
+ *
+ * This function implements the 'vm' command. Print a vm_area_struct.
+ *
+ * vm [-v] <address> Print vm_area_struct at <address>
+ * vmp [-v] <pid> Print all vm_area_structs for <pid>
+ */
+
+static int
+kdbm_vm(int argc, const char **argv)
+{
+ unsigned long addr;
+ long offset = 0;
+ int nextarg;
+ int diag;
+ int verbose_flg = 0;
+
+ if (argc == 2) {
+ if (strcmp(argv[1], "-v") != 0) {
+ return KDB_ARGCOUNT;
+ }
+ verbose_flg = 1;
+ } else if (argc != 1) {
+ return KDB_ARGCOUNT;
+ }
+
+ if (strcmp(argv[0], "vmp") == 0) {
+ struct task_struct *g, *tp;
+ struct vm_area_struct *vp;
+ pid_t pid;
+
+ if ((diag = kdbgetularg(argv[argc], (unsigned long *) &pid)))
+ return diag;
+
+ kdb_do_each_thread(g, tp) {
+ if (tp->pid == pid) {
+ if (tp->mm != NULL) {
+ if (verbose_flg)
+ kdb_printf
+ ("vm_area_struct ");
+ kdb_printf
+ ("vm_start vm_end vm_flags\n");
+ vp = tp->mm->mmap;
+ while (vp != NULL) {
+ kdbm_print_vmp(vp, verbose_flg);
+ vp = vp->vm_next;
+ }
+ }
+ return 0;
+ }
+ } kdb_while_each_thread(g, tp);
+
+ kdb_printf("No process with pid == %d found\n", pid);
+
+ } else {
+ struct vm_area_struct v;
+
+ nextarg = argc;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset,
+ NULL))
+ || (diag = kdb_getarea(v, addr)))
+ return (diag);
+
+ kdbm_print_vm(&v, addr, verbose_flg);
+ }
+
+ return 0;
+}
+
+static int
+kdbm_print_pte(pte_t * pte)
+{
+ kdb_printf("0x%lx (", (unsigned long) pte_val(*pte));
+
+ if (pte_present(*pte)) {
+#ifdef pte_exec
+ if (pte_exec(*pte))
+ kdb_printf("X");
+#endif
+ if (pte_write(*pte))
+ kdb_printf("W");
+#ifdef pte_read
+ if (pte_read(*pte))
+ kdb_printf("R");
+#endif
+ if (pte_young(*pte))
+ kdb_printf("A");
+ if (pte_dirty(*pte))
+ kdb_printf("D");
+
+ } else {
+ kdb_printf("OFFSET=0x%lx ", swp_offset(pte_to_swp_entry(*pte)));
+ kdb_printf("TYPE=0x%ulx", swp_type(pte_to_swp_entry(*pte)));
+ }
+
+ kdb_printf(")");
+
+ /* final newline is output by caller of kdbm_print_pte() */
+
+ return 0;
+}
+
+/*
+ * kdbm_pte
+ *
+ * This function implements the 'pte' command. Print all pte_t structures
+ * that map to the given virtual address range (<address> through <address>
+ * plus <nbytes>) for the given process. The default value for nbytes is
+ * one.
+ *
+ * pte -m <mm> <address> [<nbytes>] Print all pte_t structures for
+ * virtual <address> in address space
+ * of <mm> which is a pointer to a
+ * mm_struct
+ * pte -p <pid> <address> [<nbytes>] Print all pte_t structures for
+ * virtual <address> in address space
+ * of <pid>
+ */
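+/*
+ * Example (the pid and address below are illustrative): show the pte
+ * mapping one page at user address 0x4000 in the address space of pid 42:
+ *
+ *	kdb> pte -p 42 0x4000
+ */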
+
+static int
+kdbm_pte(int argc, const char **argv)
+{
+ unsigned long addr;
+ long offset = 0;
+ int nextarg;
+ unsigned long nbytes = 1;
+ long npgs;
+ int diag;
+ int found;
+ pid_t pid;
+ struct task_struct *tp;
+ struct mm_struct *mm, copy_of_mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (argc < 3 || argc > 4) {
+ return KDB_ARGCOUNT;
+ }
+
+ if (strcmp(argv[1], "-p") == 0) {
+ if ((diag = kdbgetularg(argv[2], (unsigned long *) &pid))) {
+ return diag;
+ }
+
+ found = 0;
+ for_each_process(tp) {
+ if (tp->pid == pid) {
+ if (tp->mm != NULL) {
+ found = 1;
+ break;
+ }
+ kdb_printf("task structure's mm field is NULL\n");
+ return 0;
+ }
+ }
+
+ if (!found) {
+ kdb_printf("No process with pid == %d found\n", pid);
+ return 0;
+ }
+ mm = tp->mm;
+ } else if (strcmp(argv[1], "-m") == 0) {
+
+
+ nextarg = 2;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset,
+ NULL))
+ || (diag = kdb_getarea(copy_of_mm, addr)))
+ return (diag);
+ mm = &copy_of_mm;
+ } else {
+ return KDB_ARGCOUNT;
+ }
+
+ if ((diag = kdbgetularg(argv[3], &addr))) {
+ return diag;
+ }
+
+ if (argc == 4) {
+ if ((diag = kdbgetularg(argv[4], &nbytes))) {
+ return diag;
+ }
+ }
+
+ kdb_printf("vaddr pte\n");
+
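+ /* number of pages spanned by the range [addr, addr + nbytes) */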
+ npgs = ((((addr & ~PAGE_MASK) + nbytes) + ~PAGE_MASK) >> PAGE_SHIFT);
+ while (npgs-- > 0) {
+
+ kdb_printf("0x%p ", (void *) (addr & PAGE_MASK));
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, addr);
+ if (pud_present(*pud)) {
+ pmd = pmd_offset(pud, addr);
+ if (pmd_present(*pmd)) {
+ pte = pte_offset_map(pmd, addr);
+ if (pte_present(*pte)) {
+ kdbm_print_pte(pte);
+ }
+ }
+ }
+ }
+
+ kdb_printf("\n");
+ addr += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
+/*
+ * kdbm_rpte
+ *
+ * This function implements the 'rpte' command. Print all pte_t structures
+ * that contain the given physical page range (<pfn> through <pfn>
+ * plus <npages>) for the given process. The default value for npages is
+ * one.
+ *
+ * rpte -m <mm> <pfn> [<npages>] Print all pte_t structures for
+ * physical page <pfn> in address space
+ * of <mm> which is a pointer to a
+ * mm_struct
+ * rpte -p <pid> <pfn> [<npages>] Print all pte_t structures for
+ * physical page <pfn> in address space
+ * of <pid>
+ */
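+/*
+ * Example (the pid and pfn below are illustrative): find every pte in
+ * pid 42's address space that maps physical page frame 0x1000:
+ *
+ *	kdb> rpte -p 42 0x1000
+ */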
+
+static int
+kdbm_rpte(int argc, const char **argv)
+{
+ unsigned long addr;
+ unsigned long pfn;
+ long offset = 0;
+ int nextarg;
+ unsigned long npages = 1;
+ int diag;
+ int found;
+ pid_t pid;
+ struct task_struct *tp;
+ struct mm_struct *mm, copy_of_mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long g, u, m, t;
+
+ if (argc < 3 || argc > 4) {
+ return KDB_ARGCOUNT;
+ }
+
+ if (strcmp(argv[1], "-p") == 0) {
+ if ((diag = kdbgetularg(argv[2], (unsigned long *) &pid))) {
+ return diag;
+ }
+
+ found = 0;
+ for_each_process(tp) {
+ if (tp->pid == pid) {
+ if (tp->mm != NULL) {
+ found = 1;
+ break;
+ }
+ kdb_printf("task structure's mm field is NULL\n");
+ return 0;
+ }
+ }
+
+ if (!found) {
+ kdb_printf("No process with pid == %d found\n", pid);
+ return 0;
+ }
+ mm = tp->mm;
+ } else if (strcmp(argv[1], "-m") == 0) {
+
+
+ nextarg = 2;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset,
+ NULL))
+ || (diag = kdb_getarea(copy_of_mm, addr)))
+ return (diag);
+ mm = &copy_of_mm;
+ } else {
+ return KDB_ARGCOUNT;
+ }
+
+ if ((diag = kdbgetularg(argv[3], &pfn))) {
+ return diag;
+ }
+
+ if (argc == 4) {
+ if ((diag = kdbgetularg(argv[4], &npages))) {
+ return diag;
+ }
+ }
+
+ /* the spacing after "vaddr" depends on sizeof(unsigned long) */
+ kdb_printf("pfn vaddr%*s pte\n",
+ (int)(2*sizeof(unsigned long) + 2 - 5), " ");
+
+ for (g = 0, pgd = pgd_offset(mm, 0UL); g < PTRS_PER_PGD; ++g, ++pgd) {
+ if (pgd_none(*pgd) || pgd_bad(*pgd))
+ continue;
+ for (u = 0, pud = pud_offset(pgd, 0UL); u < PTRS_PER_PUD; ++u, ++pud) {
+ if (pud_none(*pud) || pud_bad(*pud))
+ continue;
+ for (m = 0, pmd = pmd_offset(pud, 0UL); m < PTRS_PER_PMD; ++m, ++pmd) {
+ if (pmd_none(*pmd) || pmd_bad(*pmd))
+ continue;
+ for (t = 0, pte = pte_offset_map(pmd, 0UL); t < PTRS_PER_PTE; ++t, ++pte) {
+ if (pte_none(*pte))
+ continue;
+ if (pte_pfn(*pte) < pfn || pte_pfn(*pte) >= (pfn + npages))
+ continue;
+ addr = g << PGDIR_SHIFT;
+#ifdef __ia64__
+ /* IA64 plays tricks with the pgd mapping to save space.
+ * This reverses pgd_index().
+ */
+ {
+ unsigned long region = g >> (PAGE_SHIFT - 6);
+ unsigned long l1index = g - (region << (PAGE_SHIFT - 6));
+ addr = (region << 61) + (l1index << PGDIR_SHIFT);
+ }
+#endif
+ addr += (m << PMD_SHIFT) + (t << PAGE_SHIFT);
+ kdb_printf("0x%-14lx " kdb_bfd_vma_fmt0 " ",
+ pte_pfn(*pte), addr);
+ kdbm_print_pte(pte);
+ kdb_printf("\n");
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int
+kdbm_print_dentry(unsigned long daddr)
+{
+ struct dentry d;
+ int diag;
+ char buf[256];
+
+ kdb_printf("Dentry at 0x%lx\n", daddr);
+ if ((diag = kdb_getarea(d, (unsigned long)daddr)))
+ return diag;
+
+ if ((d.d_name.len > sizeof(buf)) || (diag = kdb_getarea_size(buf, (unsigned long)(d.d_name.name), d.d_name.len)))
+ kdb_printf(" d_name.len = %d d_name.name = 0x%p\n",
+ d.d_name.len, d.d_name.name);
+ else
+ kdb_printf(" d_name.len = %d d_name.name = 0x%p <%.*s>\n",
+ d.d_name.len, d.d_name.name,
+ (int)(d.d_name.len), d.d_name.name);
+
+ kdb_printf(" d_count = %d d_flags = 0x%x d_inode = 0x%p\n",
+ atomic_read(&d.d_count), d.d_flags, d.d_inode);
+
+ kdb_printf(" d_parent = 0x%p\n", d.d_parent);
+
+ kdb_printf(" d_hash.nxt = 0x%p d_hash.prv = 0x%p\n",
+ d.d_hash.next, d.d_hash.pprev);
+
+ kdb_printf(" d_lru.nxt = 0x%p d_lru.prv = 0x%p\n",
+ d.d_lru.next, d.d_lru.prev);
+
+ kdb_printf(" d_child.nxt = 0x%p d_child.prv = 0x%p\n",
+ d.d_u.d_child.next, d.d_u.d_child.prev);
+
+ kdb_printf(" d_subdirs.nxt = 0x%p d_subdirs.prv = 0x%p\n",
+ d.d_subdirs.next, d.d_subdirs.prev);
+
+ kdb_printf(" d_alias.nxt = 0x%p d_alias.prv = 0x%p\n",
+ d.d_alias.next, d.d_alias.prev);
+
+ kdb_printf(" d_op = 0x%p d_sb = 0x%p d_fsdata = 0x%p\n",
+ d.d_op, d.d_sb, d.d_fsdata);
+
+ kdb_printf(" d_iname = %s\n",
+ d.d_iname);
+
+ if (d.d_inode) {
+ struct inode i;
+ kdb_printf("\nInode Entry at 0x%p\n", d.d_inode);
+ if ((diag = kdb_getarea(i, (unsigned long)d.d_inode)))
+ return diag;
+ kdb_printf(" i_mode = 0%o i_nlink = %d i_rdev = 0x%x\n",
+ i.i_mode, i.i_nlink, i.i_rdev);
+
+ kdb_printf(" i_ino = %ld i_count = %d\n",
+ i.i_ino, atomic_read(&i.i_count));
+
+ kdb_printf(" i_hash.nxt = 0x%p i_hash.prv = 0x%p\n",
+ i.i_hash.next, i.i_hash.pprev);
+
+ kdb_printf(" i_list.nxt = 0x%p i_list.prv = 0x%p\n",
+ i.i_list.next, i.i_list.prev);
+
+ kdb_printf(" i_dentry.nxt = 0x%p i_dentry.prv = 0x%p\n",
+ i.i_dentry.next, i.i_dentry.prev);
+
+ }
+ kdb_printf("\n");
+ return 0;
+}
+
+static int
+kdbm_filp(int argc, const char **argv)
+{
+ struct file f;
+ int nextarg;
+ unsigned long addr;
+ long offset;
+ int diag;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)) ||
+ (diag = kdb_getarea(f, addr)))
+ return diag;
+
+ kdb_printf("File Pointer at 0x%lx\n", addr);
+
+ kdb_printf(" fu_list.nxt = 0x%p fu_list.prv = 0x%p\n",
+ f.f_u.fu_list.next, f.f_u.fu_list.prev);
+
+ kdb_printf(" f_dentry = 0x%p f_vfsmnt = 0x%p f_op = 0x%p\n",
+ f.f_dentry, f.f_vfsmnt, f.f_op);
+
+ kdb_printf(" f_count = %ld f_flags = 0x%x f_mode = 0x%x\n",
- f.f_count, f.f_flags, f.f_mode);
++ atomic_long_read(&f.f_count), f.f_flags, f.f_mode);
+
+ kdb_printf(" f_pos = %Ld\n", f.f_pos);
+#ifdef CONFIG_SECURITY
+ kdb_printf(" security = 0x%p\n", f.f_security);
+#endif
+
+ kdb_printf(" private_data = 0x%p f_mapping = 0x%p\n\n",
+ f.private_data, f.f_mapping);
+
+ return kdbm_print_dentry((unsigned long)f.f_dentry);
+}
+
+static int
+kdbm_fl(int argc, const char **argv)
+{
+ struct file_lock fl;
+ int nextarg;
+ unsigned long addr;
+ long offset;
+ int diag;
+
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)) ||
+ (diag = kdb_getarea(fl, addr)))
+ return diag;
+
+ kdb_printf("File_lock at 0x%lx\n", addr);
+
+ kdb_printf(" fl_next = 0x%p fl_link.nxt = 0x%p fl_link.prv = 0x%p\n",
+ fl.fl_next, fl.fl_link.next, fl.fl_link.prev);
+ kdb_printf(" fl_block.nxt = 0x%p fl_block.prv = 0x%p\n",
+ fl.fl_block.next, fl.fl_block.prev);
+ kdb_printf(" fl_owner = 0x%p fl_pid = %d fl_wait = 0x%p\n",
+ fl.fl_owner, fl.fl_pid, &fl.fl_wait);
+ kdb_printf(" fl_file = 0x%p fl_flags = 0x%x\n",
+ fl.fl_file, fl.fl_flags);
+ kdb_printf(" fl_type = %d fl_start = 0x%llx fl_end = 0x%llx\n",
+ fl.fl_type, fl.fl_start, fl.fl_end);
+
+ kdb_printf(" file_lock_operations");
+ if (fl.fl_ops)
+ kdb_printf("\n fl_copy_lock = 0x%p fl_release_private = 0x%p\n",
+ fl.fl_ops->fl_copy_lock, fl.fl_ops->fl_release_private);
+ else
+ kdb_printf(" empty\n");
+
+ kdb_printf(" lock_manager_operations");
+ if (fl.fl_lmops)
+ kdb_printf("\n fl_compare_owner = 0x%p fl_notify = 0x%p\n",
+ fl.fl_lmops->fl_compare_owner, fl.fl_lmops->fl_notify);
+ else
+ kdb_printf(" empty\n");
+
+ kdb_printf(" fl_fasync = 0x%p fl_break 0x%lx\n",
+ fl.fl_fasync, fl.fl_break_time);
+
+ return 0;
+}
+
+
+static int
+kdbm_dentry(int argc, const char **argv)
+{
+ int nextarg;
+ unsigned long addr;
+ long offset;
+ int diag;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)))
+ return diag;
+
+ return kdbm_print_dentry(addr);
+}
+
+static int
+kdbm_kobject(int argc, const char **argv)
+{
+ struct kobject k;
+ int nextarg;
+ unsigned long addr;
+ long offset;
+ int diag;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)) ||
+ (diag = kdb_getarea(k, addr)))
+ return diag;
+
+
+ kdb_printf("kobject at 0x%lx\n", addr);
+
+ if (k.name) {
+ char c;
+ kdb_printf(" name 0x%p", k.name);
+ if (kdb_getarea(c, (unsigned long)k.name) == 0)
+ kdb_printf(" '%s'", k.name);
+ kdb_printf("\n");
+ }
+
+ if (k.name != kobject_name((struct kobject *)addr))
+ kdb_printf(" name '%.20s'\n", k.name);
+
+ kdb_printf(" kref.refcount %d'\n", atomic_read(&k.kref.refcount));
+
+ kdb_printf(" entry.next = 0x%p entry.prev = 0x%p\n",
+ k.entry.next, k.entry.prev);
+
+ kdb_printf(" parent = 0x%p kset = 0x%p ktype = 0x%p sd = 0x%p\n",
+ k.parent, k.kset, k.ktype, k.sd);
+
+ return 0;
+}
+
+static int
+kdbm_sh(int argc, const char **argv)
+{
+ int diag;
+ int nextarg;
+ unsigned long addr;
+ long offset = 0L;
+ struct Scsi_Host sh;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)) ||
+ (diag = kdb_getarea(sh, addr)))
+ return diag;
+
+ kdb_printf("Scsi_Host at 0x%lx\n", addr);
+ kdb_printf("host_queue = 0x%p\n", sh.__devices.next);
+ kdb_printf("ehandler = 0x%p eh_action = 0x%p\n",
+ sh.ehandler, sh.eh_action);
+ kdb_printf("host_wait = 0x%p hostt = 0x%p\n",
+ &sh.host_wait, sh.hostt);
+ kdb_printf("host_failed = %d host_no = %d resetting = %d\n",
+ sh.host_failed, sh.host_no, sh.resetting);
+ kdb_printf("max id/lun/channel = [%d/%d/%d] this_id = %d\n",
+ sh.max_id, sh.max_lun, sh.max_channel, sh.this_id);
+ kdb_printf("can_queue = %d cmd_per_lun = %d sg_tablesize = %d u_isa_dma = %d\n",
+ sh.can_queue, sh.cmd_per_lun, sh.sg_tablesize, sh.unchecked_isa_dma);
+ kdb_printf("host_blocked = %d reverse_ordering = %d \n",
+ sh.host_blocked, sh.reverse_ordering);
+
+ return 0;
+}
+
+static int
+kdbm_sd(int argc, const char **argv)
+{
+ int diag;
+ int nextarg;
+ unsigned long addr;
+ long offset = 0L;
+ struct scsi_device *sd = NULL;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)))
+ goto out;
+ if (!(sd = kmalloc(sizeof(*sd), GFP_ATOMIC))) {
+ kdb_printf("kdbm_sd: cannot kmalloc sd\n");
+ goto out;
+ }
+ if ((diag = kdb_getarea(*sd, addr)))
+ goto out;
+
+ kdb_printf("scsi_device at 0x%lx\n", addr);
+ kdb_printf("next = 0x%p prev = 0x%p host = 0x%p\n",
+ sd->siblings.next, sd->siblings.prev, sd->host);
+ kdb_printf("device_busy = %d current_cmnd 0x%p\n",
+ sd->device_busy, sd->current_cmnd);
+ kdb_printf("id/lun/chan = [%d/%d/%d] single_lun = %d device_blocked = %d\n",
+ sd->id, sd->lun, sd->channel, sd->sdev_target->single_lun, sd->device_blocked);
+ kdb_printf("queue_depth = %d current_tag = %d scsi_level = %d\n",
+ sd->queue_depth, sd->current_tag, sd->scsi_level);
+ kdb_printf("%8.8s %16.16s %4.4s\n", sd->vendor, sd->model, sd->rev);
+out:
+ if (sd)
+ kfree(sd);
+ return diag;
+}
+
+static int
+kdbm_sc(int argc, const char **argv)
+{
+ int diag;
+ int nextarg;
+ unsigned long addr;
+ long offset = 0L;
+ struct scsi_cmnd *sc = NULL;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)))
+ goto out;
+ if (!(sc = kmalloc(sizeof(*sc), GFP_ATOMIC))) {
+ kdb_printf("kdbm_sc: cannot kmalloc sc\n");
+ goto out;
+ }
+ if ((diag = kdb_getarea(*sc, addr)))
+ goto out;
+
+ kdb_printf("scsi_cmnd at 0x%lx\n", addr);
+ kdb_printf("device = 0x%p next = 0x%p\n",
+ sc->device, sc->list.next);
+ kdb_printf("serial_number = %ld retries = %d\n",
+ sc->serial_number, sc->retries);
+ kdb_printf("cmd_len = %d\n", sc->cmd_len);
+ kdb_printf("cmnd = [%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x]\n",
+ sc->cmnd[0], sc->cmnd[1], sc->cmnd[2], sc->cmnd[3], sc->cmnd[4],
+ sc->cmnd[5], sc->cmnd[6], sc->cmnd[7], sc->cmnd[8], sc->cmnd[9],
+ sc->cmnd[10], sc->cmnd[11]);
+ kdb_printf("request_buffer = 0x%p request_bufflen = %d\n",
+ scsi_sglist(sc), scsi_bufflen(sc));
+ kdb_printf("use_sg = %d\n", scsi_sg_count(sc));
+ kdb_printf("underflow = %d transfersize = %d\n",
+ sc->underflow, sc->transfersize);
+ kdb_printf("tag = %d\n", sc->tag);
+
+out:
+ if (sc)
+ kfree(sc);
+ return diag;
+}
+
+static int __init kdbm_vm_init(void)
+{
+ kdb_register("vm", kdbm_vm, "[-v] <vaddr>", "Display vm_area_struct", 0);
+ kdb_register("vmp", kdbm_vm, "[-v] <pid>", "Display all vm_area_struct for <pid>", 0);
+#ifdef CONFIG_NUMA
+ kdb_register("mempolicy", kdbm_mpol, "<vaddr>", "Display mempolicy structure", 0);
+ kdb_register("pgdat", kdbm_pgdat, "<node_id>", "Display pglist_data node structure", 0);
+#else
+ kdb_register("pgdat", kdbm_pgdat, "", "Display pglist_data node structure", 0);
+#endif
+ kdb_register("pte", kdbm_pte, "( -m <mm> | -p <pid> ) <vaddr> [<nbytes>]", "Display pte_t for mm_struct or pid", 0);
+ kdb_register("rpte", kdbm_rpte, "( -m <mm> | -p <pid> ) <pfn> [<npages>]", "Find pte_t containing pfn for mm_struct or pid", 0);
+ kdb_register("dentry", kdbm_dentry, "<dentry>", "Display interesting dentry stuff", 0);
+ kdb_register("kobject", kdbm_kobject, "<kobject>", "Display interesting kobject stuff", 0);
+ kdb_register("filp", kdbm_filp, "<filp>", "Display interesting filp stuff", 0);
+ kdb_register("fl", kdbm_fl, "<fl>", "Display interesting file_lock stuff", 0);
+ kdb_register("sh", kdbm_sh, "<vaddr>", "Show scsi_host", 0);
+ kdb_register("sd", kdbm_sd, "<vaddr>", "Show scsi_device", 0);
+ kdb_register("sc", kdbm_sc, "<vaddr>", "Show scsi_cmnd", 0);
+
+ return 0;
+}
+
+static void __exit kdbm_vm_exit(void)
+{
+ kdb_unregister("vm");
+ kdb_unregister("vmp");
+#ifdef CONFIG_NUMA
+ kdb_unregister("mempolicy");
+#endif
+ kdb_unregister("pgdat");
+ kdb_unregister("pte");
+ kdb_unregister("rpte");
+ kdb_unregister("dentry");
+ kdb_unregister("kobject");
+ kdb_unregister("filp");
+ kdb_unregister("fl");
+ kdb_unregister("sh");
+ kdb_unregister("sd");
+ kdb_unregister("sc");
+}
+
+module_init(kdbm_vm_init)
+module_exit(kdbm_vm_exit)
#include <asm/system.h>
#include <asm/sections.h>
+#ifdef CONFIG_KDB_KDUMP
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kdb.h>
+#endif
+
- #ifndef CONFIG_XEN
/* Per cpu memory for storing cpu states in case of system crash. */
- note_buf_t* crash_notes;
- #endif
+ note_buf_t __percpu *crash_notes;
/* vmcoreinfo stuff */
static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
* We can't use the "normal" timers since we just panicked.
*/
printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
-
+#ifdef CONFIG_BOOTSPLASH
+ {
+ extern int splash_verbose(void);
+ (void)splash_verbose();
+ }
+#endif
- for (i = 0; i < panic_timeout*1000; ) {
+ for (i = 0; i < panic_timeout; i++) {
touch_nmi_watchdog();
- i += panic_blink(i);
- mdelay(1);
- i++;
+ panic_blink_one_second();
}
/*
* This will not be a clean reboot, with everything
}
#endif
local_irq_enable();
+#ifdef CONFIG_BOOTSPLASH
+ {
+ extern int splash_verbose(void);
+ (void)splash_verbose();
+ }
+#endif
- for (i = 0; ; ) {
+ while (1) {
touch_softlockup_watchdog();
- i += panic_blink(i);
- mdelay(1);
- i++;
+ panic_blink_one_second();
}
}
#include <linux/kexec.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
+ #include <linux/syslog.h>
+#include <linux/jhash.h>
+#include <linux/device.h>
#include <asm/uaccess.h>
SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
{
- return do_syslog(type, buf, len);
+ return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
}
+#ifdef CONFIG_DEBUG_KERNEL
+/* It's very handy to be able to view the syslog buffer during debug.
+ * But do_syslog() uses locks so it cannot be used during debugging.
+ * Instead, provide the start and end of the physical and logical logs.
+ * This is equivalent to do_syslog(3).
+ */
+void debugger_syslog_data(char *syslog_data[4])
+{
+ syslog_data[0] = log_buf;
+ syslog_data[1] = log_buf + log_buf_len;
+ syslog_data[2] = log_buf + log_end - (logged_chars < log_buf_len ? logged_chars : log_buf_len);
+ syslog_data[3] = log_buf + log_end;
+}
+#endif /* CONFIG_DEBUG_KERNEL */
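+/*
+ * Sketch of how a debugger back end might consume the four pointers
+ * returned above (the variable name is illustrative, not a kernel API):
+ *
+ *	char *sd[4];
+ *	debugger_syslog_data(sd);
+ *	physical log buffer: sd[0] .. sd[1]
+ *	logged characters:   sd[2] .. sd[3]
+ */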
+
/*
* Call the console drivers on a range of log_buf
*/
.mode = 0644,
.proc_handler = scan_unevictable_handler,
},
+ {
+ .procname = "heap-stack-gap",
+ .data = &heap_stack_gap,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
- #ifdef CONFIG_PRESWAP
- {
- .procname = "preswap",
- .data = NULL,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = preswap_sysctl_handler,
- .extra1 = (void *)&preswap_zero,
- .extra2 = (void *)&preswap_infinity,
- },
- #endif
#ifdef CONFIG_MEMORY_FAILURE
{
.procname = "memory_failure_early_kill",
unlock_page(page);
goto out;
}
+
+ if (sis->flags & SWP_FILE) {
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+
+ ret = mapping->a_ops->swap_out(swap_file, page, wbc);
+ if (!ret)
+ count_vm_event(PSWPOUT);
+ return ret;
+ }
+
- if (preswap_put(page) == 1) {
- set_page_writeback(page);
- unlock_page(page);
- end_page_writeback(page);
- goto out;
- }
-
bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageUptodate(page));
+
+ if (sis->flags & SWP_FILE) {
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+
+ ret = mapping->a_ops->swap_in(swap_file, page);
+ if (!ret)
+ count_vm_event(PSWPIN);
+ return ret;
+ }
+
- if (preswap_get(page) == 1) {
- SetPageUptodate(page);
- unlock_page(page);
- goto out;
- }
-
bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
object = c->page->freelist;
if (unlikely(!object))
goto another_slab;
- if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
- goto debug;
+ if (unlikely(PageSlubDebug(c->page) || c->reserve))
+ goto slow_path;
- c->freelist = object[c->offset];
+ c->freelist = get_freepointer(s, object);
c->page->inuse = c->page->objects;
c->page->freelist = NULL;
c->node = page_to_nid(c->page);
local_irq_disable();
if (new) {
- c = get_cpu_slab(s, smp_processor_id());
+ c = __this_cpu_ptr(s->cpu_slab);
+ c->reserve = reserve;
- stat(c, ALLOC_SLAB);
+ stat(s, ALLOC_SLAB);
if (c->page)
flush_slab(s, c);
slab_lock(new);
if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
slab_out_of_memory(s, gfpflags, node);
return NULL;
-debug:
- if (!alloc_debug_processing(s, c->page, object, addr))
+
+slow_path:
+ if (PageSlubDebug(c->page) &&
+ !alloc_debug_processing(s, c->page, object, addr))
goto another_slab;
+ /*
+ * Avoid the slub fast path in slab_alloc() by not setting
+ * c->freelist and the fast path in slab_free() by making
+ * node_match() fail by setting c->node to -1.
+ *
+ * We use this for debug and reserve checks which need
+ * to be done for each allocation.
+ */
+
c->page->inuse++;
- c->page->freelist = object[c->offset];
+ c->page->freelist = get_freepointer(s, object);
c->node = -1;
goto unlock_out;
}
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
struct net_device *null_or_orig;
+ struct net_device *null_or_bond;
int ret = NET_RX_DROP;
__be16 type;
+ unsigned long pflags = current->flags;
if (!skb->tstamp.tv64)
net_timestamp(skb);
}
#endif
- #ifdef CONFIG_XEN
- switch (skb->ip_summed) {
- case CHECKSUM_UNNECESSARY:
- skb->proto_data_valid = 1;
- break;
- case CHECKSUM_PARTIAL:
- /* XXX Implement me. */
- default:
- skb->proto_data_valid = 0;
- break;
- }
- #endif
-
+ if (skb_emergency(skb))
+ goto skip_taps;
+
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
ptype->dev == orig_dev) {
ncls:
#endif
+ if (!skb_emergency_protocol(skb))
+ goto drop;
+
skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
if (!skb)
- goto out;
+ goto unlock;
skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
if (!skb)
- goto out;
+ goto unlock;
+ /*
+ * Make sure frames received on VLAN interfaces stacked on
+ * bonding interfaces still make their way to any base bonding
+ * device that may have registered for a specific ptype. The
+ * handler may have to adjust skb->dev and orig_dev.
+ */
+ null_or_bond = NULL;
+ if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
+ (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
+ null_or_bond = vlan_dev_real_dev(skb->dev);
+ }
+
type = skb->protocol;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
}
#endif
- static int ipv4_frags_init_net(struct net *net)
+ static int __net_init ipv4_frags_init_net(struct net *net)
{
+ int ret;
+
/*
* Fragment cache limits. We will commit 256K at one time. Should we
* cross that limit we will prune down to 192K. This should cope with
inet_frags_init_net(&net->ipv4.frags);
- return ip4_frags_ns_ctl_register(net);
+ ret = ip4_frags_ns_ctl_register(net);
+ if (ret)
+ goto out_reg;
+
+ mem_reserve_init(&net->ipv4.frags.reserve, "IPv4 fragment cache",
+ &net_skb_reserve);
+ ret = mem_reserve_kmalloc_set(&net->ipv4.frags.reserve,
+ net->ipv4.frags.high_thresh);
+ if (ret)
+ goto out_reserve;
+
+ return 0;
+
+out_reserve:
+ mem_reserve_disconnect(&net->ipv4.frags.reserve);
+ ip4_frags_ns_ctl_unregister(net);
+out_reg:
+ inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+
+ return ret;
}
- static void ipv4_frags_exit_net(struct net *net)
+ static void __net_exit ipv4_frags_exit_net(struct net *net)
{
+ mem_reserve_disconnect(&net->ipv4.frags.reserve);
ip4_frags_ns_ctl_unregister(net);
inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
}
read_lock_bh(&idev->lock);
if (ifp->dead)
goto out;
- spin_lock_bh(&ifp->lock);
+ spin_lock(&ifp->lock);
if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
+ !(dev->flags&IFF_MULTICAST) ||
idev->cnf.accept_dad < 1 ||
!(ifp->flags&IFA_F_TENTATIVE) ||
ifp->flags & IFA_F_NODAD) {
}
#endif
- static int ipv6_frags_init_net(struct net *net)
+ static int __net_init ipv6_frags_init_net(struct net *net)
{
+ int ret;
+
- net->ipv6.frags.high_thresh = 256 * 1024;
- net->ipv6.frags.low_thresh = 192 * 1024;
+ net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+ net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
inet_frags_init_net(&net->ipv6.frags);
- return ip6_frags_ns_sysctl_register(net);
+ ret = ip6_frags_ns_sysctl_register(net);
+ if (ret)
+ goto out_reg;
+
+ mem_reserve_init(&net->ipv6.frags.reserve, "IPv6 fragment cache",
+ &net_skb_reserve);
+ ret = mem_reserve_kmalloc_set(&net->ipv6.frags.reserve,
+ net->ipv6.frags.high_thresh);
+ if (ret)
+ goto out_reserve;
+
+ return 0;
+
+out_reserve:
+ mem_reserve_disconnect(&net->ipv6.frags.reserve);
+ ip6_frags_ns_sysctl_unregister(net);
+out_reg:
+ inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+
+ return ret;
}
- static void ipv6_frags_exit_net(struct net *net)
+ static void __net_exit ipv6_frags_exit_net(struct net *net)
{
+ mem_reserve_disconnect(&net->ipv6.frags.reserve);
ip6_frags_ns_sysctl_unregister(net);
inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
}
--- /dev/null
+/*
+ * AppArmor security module
+ *
+ * This file contains AppArmor function for pathnames
+ *
+ * Copyright (C) 1998-2008 Novell/SUSE
+ * Copyright 2009 Canonical Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/path.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs_struct.h>
+
+#include "include/apparmor.h"
+#include "include/path.h"
+
+int aa_get_name_to_buffer(struct path *path, int is_dir, char *buffer, int size,
+ char **name)
+{
+ int error = d_namespace_path(path, buffer, size - is_dir, name);
+
+ if (!error && is_dir && (*name)[1] != '\0')
+ /*
+ * Append "/" to the pathname. The root directory is a special
+ * case; it already ends in slash.
+ */
+ strcpy(&buffer[size - 2], "/");
+
+ return error;
+}
+
+/**
+ * aa_get_name - compute the pathname of a file
+ * @path: path of the file
+ * @is_dir: set if the file is a directory
+ * @buffer: on return, the buffer that aa_get_name() allocated
+ * @name: on return, a pointer into @buffer at the start of the pathname
+ *
+ * Returns an error code if there was a failure in obtaining the
+ * name.
+ *
+ * @name is a pointer to the beginning of the pathname (which usually differs
+ * from the beginning of the buffer), or NULL. If there is an error @name
+ * may contain a partial or invalid name (in the case of a deleted file), that
+ * can be used for audit purposes, but it can not be used for mediation.
+ *
+ * We need @is_dir to indicate whether the file is a directory or not because
+ * the file may not yet exist, and so we cannot check the inode's file type.
+ */
+int aa_get_name(struct path *path, int is_dir, char **buffer, char **name)
+{
+ char *buf, *str = NULL;
+ int size = 256;
+ int error;
+
+ *name = NULL;
+ *buffer = NULL;
+ for (;;) {
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ error = aa_get_name_to_buffer(path, is_dir, buf, size, &str);
+ if (!error || (error == -ENOENT) || (error == -ESTALE))
+ break;
+
+ kfree(buf);
+ size <<= 1;
+ if (size > g_apparmor_path_max)
+ return -ENAMETOOLONG;
+ }
+ *buffer = buf;
+ *name = str;
+
+ return error;
+}
+
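+/*
+ * Minimal calling sketch for aa_get_name() (error handling abbreviated;
+ * identifiers other than aa_get_name() and kfree() are illustrative):
+ *
+ *	char *buffer, *name;
+ *	int error = aa_get_name(&file->f_path, 0, &buffer, &name);
+ *	if (!error)
+ *		... use name for mediation ...
+ *	kfree(buffer);
+ */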
++/* Only needed until d_namespace_path is cleaned up and doesn't use
++ * vfsmount_lock anymore. -jeffm */
++extern spinlock_t vfsmount_lock;
++
+int d_namespace_path(struct path *path, char *buf, int buflen, char **name)
+{
+ struct path root, tmp, ns_root = { };
+ char *res;
+ int deleted;
+ int error = 0;
+
+ read_lock(&current->fs->lock);
+ root = current->fs->root;
+ path_get(&current->fs->root);
+ read_unlock(&current->fs->lock);
+ spin_lock(&vfsmount_lock);
+ if (root.mnt && root.mnt->mnt_ns)
+ ns_root.mnt = mntget(root.mnt->mnt_ns->root);
+ if (ns_root.mnt)
+ ns_root.dentry = dget(ns_root.mnt->mnt_root);
+ spin_unlock(&vfsmount_lock);
+ spin_lock(&dcache_lock);
+
+ do {
+ tmp = ns_root;
+ deleted = d_unlinked(path->dentry);
+ res = __d_path(path, &tmp, buf, buflen);
+ } while (deleted != d_unlinked(path->dentry));
+
+ *name = res;
+ /* handle error conditions - and still allow a partial path to
+ * be returned */
+ if (IS_ERR(res)) {
+ error = PTR_ERR(res);
+ *name = buf;
+ } else if (deleted) {
+ /* The stripping of (deleted) is a hack that could be removed
+ * with an updated __d_path
+ */
+
+ /* Currently 2 cases fall into here. Fixing the mediation
+ * of deleted files for things like trunc.
+ * And the newly allocated dentry case. The first case
+ * means we strip deleted for everything so the new
+ * dentry test case is commented out below.
+ */
+ buf[buflen - 11] = 0; /* - (len(" (deleted)") +\0) */
+
+ /* if (!path->dentry->d_inode) {
+ * On some filesystems, newly allocated dentries appear
+ * to the security_path hooks as a deleted
+ * dentry except without an inode allocated.
+ *
+ * Remove the appended deleted text and return as a
+ * string for normal mediation. The (deleted) string
+ * is guaranteed to be added in this case, so just
+ * strip it.
+ */
+ } else if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry)) {
+ error = -ENOENT;
+#if 0
+ } else if (tmp.dentry != ns_root.dentry && tmp.mnt != ns_root.mnt) {
+ /* don't return a pathname starting with '/' for a disconnected path */
+ error = -ESTALE;
+ if (*res == '/')
+ *name = res + 1;
+#endif
+ }
+
+ spin_unlock(&dcache_lock);
+ path_put(&root);
+ path_put(&ns_root);
+
+ return error;
+}
+
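+/*
+ * sysctl_pathname - build the "/sys/..." style path of a sysctl table entry.
+ * The name is assembled right to left in the tail of @buffer; the returned
+ * pointer points into @buffer (not necessarily at its start), or NULL is
+ * returned if @buflen is too small to hold the full path.
+ */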
+char *sysctl_pathname(struct ctl_table *table, char *buffer, int buflen)
+{
+ if (buflen < 1)
+ return NULL;
+ buffer += --buflen;
+ *buffer = '\0';
+
+ while (table) {
+ int namelen = strlen(table->procname);
+
+ if (buflen < namelen + 1)
+ return NULL;
+ buflen -= namelen + 1;
+ buffer -= namelen;
+ memcpy(buffer, table->procname, namelen);
+ *--buffer = '/';
+ table = table->parent;
+ }
+ if (buflen < 4)
+ return NULL;
+ buffer -= 4;
+ memcpy(buffer, "/sys", 4);
+
+ return buffer;
+}