select HAVE_OPROFILE
select HAVE_KPROBES
select HAVE_KRETPROBES
- select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
- select HAVE_ARCH_KGDB if !X86_VOYAGER
+ select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN
+ select HAVE_ARCH_KGDB if !X86_VOYAGER && !XEN
config ARCH_DEFCONFIG
string
config CLOCKSOURCE_WATCHDOG
def_bool y
+ depends on !XEN
config GENERIC_CLOCKEVENTS
def_bool y
+ depends on !XEN
config GENERIC_CLOCKEVENTS_BROADCAST
def_bool y
depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
+ depends on !XEN
config LOCKDEP_SUPPORT
def_bool y
config ARCH_HIBERNATION_POSSIBLE
def_bool y
depends on !SMP || !X86_VOYAGER
+ depends on !XEN
config ARCH_SUSPEND_POSSIBLE
def_bool y
bool
depends on SMP
depends on (X86_32 && !(X86_VISWS || X86_VOYAGER)) || X86_64
+ depends on !XEN
default y
config X86_BIOS_REBOOT
bool
depends on !X86_VISWS && !X86_VOYAGER
+ depends on !XEN
default y
config X86_TRAMPOLINE
bool
depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP)
+ depends on !XEN
default y
+config X86_NO_TSS
+ def_bool y
+ depends on XEN
+
+config X86_NO_IDT
+ def_bool y
+ depends on XEN
+
config KTIME_SCALAR
def_bool X86_32
source "init/Kconfig"
help
Choose this option if your computer is a standard PC or compatible.
+config X86_XEN
+ bool "Xen-compatible"
+ depends on X86_32
+ select XEN
+ select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST
+ select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST
+ select SWIOTLB
+ help
+ Choose this option if you plan to run this kernel on top of the
+ Xen Hypervisor.
+
config X86_ELAN
bool "AMD Elan"
depends on X86_32
as R-8610-(G).
If you don't have one of these chips, you should say N here.
+config X86_64_XEN
+ bool "Enable Xen compatible kernel"
+ depends on X86_64
+ select XEN
+ select SWIOTLB
+ help
+ This option will compile a kernel compatible with Xen hypervisor
+
config X86_VSMP
bool "Support for ScaleMP vSMP"
select PARAVIRT
menuconfig PARAVIRT_GUEST
bool "Paravirtualized guest support"
+ depends on !XEN
help
Say Y here to get to see options related to running Linux under
various hypervisors. This option alone does not add any kernel code.
config MEMTEST_BOOTPARAM
bool "Memtest boot parameter"
- depends on X86_64
+ depends on X86_64 && !XEN
default y
help
This option adds a kernel parameter 'memtest', which allows memtest
config HPET_TIMER
def_bool X86_64
prompt "HPET Timer Support" if X86_32
+ depends on !XEN
help
Use the IA-PC HPET (High Precision Event Timer) to manage
time in preference to the PIT and RTC, if a HPET is
default y
select SWIOTLB
select AGP
- depends on X86_64 && PCI
+ depends on X86_64 && PCI && !X86_64_XEN
help
Support for full DMA access of devices with 32bit memory access only
on systems with more than 3GB. This is usually needed for USB,
config CALGARY_IOMMU
bool "IBM Calgary IOMMU support"
select SWIOTLB
- depends on X86_64 && PCI && EXPERIMENTAL
+ depends on X86_64 && PCI && !X86_64_XEN && EXPERIMENTAL
help
Support for hardware IOMMUs in IBM's xSeries x366 and x460
systems. Needed to run systems with more than 3GB of memory
range 2 255
depends on SMP
default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
+ default "16" if X86_64_XEN
default "8"
help
This allows you to specify the maximum number of CPUs which this
config X86_UP_APIC
bool "Local APIC support on uniprocessors"
- depends on X86_32 && !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH)
+ depends on X86_32 && !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH || XEN_UNPRIVILEGED_GUEST)
help
A local APIC (Advanced Programmable Interrupt Controller) is an
integrated interrupt controller in the CPU. If you have a single-CPU
config X86_LOCAL_APIC
def_bool y
depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH))
+ depends on !XEN_UNPRIVILEGED_GUEST
config X86_IO_APIC
def_bool y
depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH))
+ depends on !XEN_UNPRIVILEGED_GUEST
config X86_VISWS_APIC
def_bool y
depends on X86_32 && X86_VISWS
+config X86_XEN_GENAPIC
+ def_bool y
+ depends on X86_64_XEN
+
config X86_MCE
bool "Machine Check Exception"
- depends on !X86_VOYAGER
+ depends on !(X86_VOYAGER || XEN)
---help---
Machine Check Exception support allows the processor to notify the
kernel if it detects a problem (e.g. overheating, component failure).
config X86_REBOOTFIXUPS
def_bool n
prompt "Enable X86 board specific fixups for reboot"
- depends on X86_32 && X86
+ depends on X86_32 && !XEN
---help---
This enables chipset and/or board specific fixups to be done
in order to get reboot to work correctly. This is only needed on
config MICROCODE
tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
+ depends on !XEN_UNPRIVILEGED_GUEST
select FW_LOADER
---help---
If you say Y here, you will be able to update the microcode on
# Common NUMA Features
config NUMA
bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
- depends on SMP
+ depends on SMP && !X86_64_XEN
depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) && EXPERIMENTAL)
default n if X86_PC
default y if (X86_NUMAQ || X86_SUMMIT)
config ARCH_SPARSEMEM_DEFAULT
def_bool y
- depends on X86_64
+ depends on X86_64 && !X86_64_XEN
config ARCH_SPARSEMEM_ENABLE
def_bool y
- depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC)
+ depends on (X86_64 && !X86_64_XEN) || NUMA || (EXPERIMENTAL && X86_PC)
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
config MATH_EMULATION
bool
prompt "Math emulation" if X86_32
+ depends on !XEN
---help---
Linux can emulate a math coprocessor (used for floating point
operations) if you don't have one. 486DX and Pentium processors have
config MTRR
bool "MTRR (Memory Type Range Register) support"
+ depends on !XEN_UNPRIVILEGED_GUEST
---help---
On Intel P6 family processors (Pentium Pro, Pentium II and later)
the Memory Type Range Registers (MTRRs) may be used to control
config EFI
def_bool n
prompt "EFI runtime service support"
- depends on ACPI
+ depends on ACPI && !XEN
---help---
This enables the kernel to use EFI runtime services that are
available (such as the EFI variable services).
config IRQBALANCE
def_bool y
prompt "Enable kernel irq balancing"
- depends on X86_32 && SMP && X86_IO_APIC
+ depends on X86_32 && SMP && X86_IO_APIC && !XEN
help
The default yes will allow the kernel to do irq load balancing.
Saying no will keep the kernel from doing irq load balancing.
config KEXEC
bool "kexec system call"
depends on X86_BIOS_REBOOT
+ depends on !XEN_UNPRIVILEGED_GUEST
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
bool "kernel crash dumps (EXPERIMENTAL)"
depends on EXPERIMENTAL
depends on X86_64 || (X86_32 && HIGHMEM)
+ depends on !XEN
help
Generate crash dump after being started by kexec.
This should be normally only set in special crash dump kernels
config RELOCATABLE
bool "Build a relocatable kernel (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ depends on EXPERIMENTAL && !XEN
help
This builds a kernel image that retains relocation information
so it can be loaded someplace besides the default 1MB.
depends on NUMA
menu "Power management options"
- depends on !X86_VOYAGER
+ depends on !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)
config ARCH_HIBERNATION_HEADER
def_bool y
menuconfig APM
tristate "APM (Advanced Power Management) BIOS support"
- depends on X86_32 && PM_SLEEP && !X86_VISWS
+ depends on X86_32 && PM_SLEEP && !(X86_VISWS || XEN)
---help---
APM is a BIOS specification for saving power using several different
techniques. This is mostly useful for battery powered laptops with
bool "PCI support" if !X86_VISWS && !X86_VSMP
depends on !X86_VOYAGER
default y
- select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
+ select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC && !XEN)
help
Find out whether you have a PCI motherboard. PCI is the name of a
bus system, i.e. the way the CPU talks to the other stuff inside
config PCI_GOBIOS
bool "BIOS"
+ depends on !XEN
config PCI_GOMMCONFIG
bool "MMConfig"
bool "OLPC"
depends on OLPC
+config PCI_GOXEN_FE
+ bool "Xen PCI Frontend"
+ depends on X86_XEN
+ help
+ The PCI device frontend driver allows the kernel to import arbitrary
+ PCI devices from a PCI backend to support PCI driver domains.
+
config PCI_GOANY
bool "Any"
config PCI_BIOS
def_bool y
- depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY)
+ depends on X86_32 && !(X86_VISWS || XEN) && PCI && (PCI_GOBIOS || PCI_GOANY)
# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
config PCI_DIRECT
bool "Support mmconfig PCI config space access"
depends on X86_64 && PCI && ACPI
+config XEN_PCIDEV_FRONTEND
+ def_bool y
+ prompt "Xen PCI Frontend" if X86_64
+ depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64)
+ select HOTPLUG
+ help
+ The PCI device frontend driver allows the kernel to import arbitrary
+ PCI devices from a PCI backend to support PCI driver domains.
+
+config XEN_PCIDEV_FE_DEBUG
+ bool "Xen PCI Frontend Debugging"
+ depends on XEN_PCIDEV_FRONTEND
+ help
+ Enables some debug statements within the PCI Frontend.
+
config DMAR
bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL
config ISA
bool "ISA support"
- depends on !(X86_VOYAGER || X86_VISWS)
+ depends on !(X86_VOYAGER || X86_VISWS || XEN)
help
Find out whether you have ISA slots on your motherboard. ISA is the
name of a bus system, i.e. the way the CPU talks to the other stuff
source "drivers/eisa/Kconfig"
config MCA
- bool "MCA support" if !(X86_VISWS || X86_VOYAGER)
+ bool "MCA support" if !(X86_VISWS || X86_VOYAGER || XEN)
default y if X86_VOYAGER
help
MicroChannel Architecture is found in some IBM PS/2 machines and
config X86_F00F_BUG
def_bool y
- depends on M586MMX || M586TSC || M586 || M486 || M386
+ depends on (M586MMX || M586TSC || M586 || M486 || M386) && !X86_NO_IDT
config X86_WP_WORKS_OK
def_bool y
config X86_TSC
def_bool y
depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
+ depends on !X86_XEN && !X86_64_XEN
# this should be set for all -march=.. options where the compiler
# generates cmov.
config DIRECT_GBPAGES
bool "Enable gbpages-mapped kernel pagetables"
- depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64
+ depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64 && !XEN
help
Enable gigabyte pages support (if the CPU supports it). This can
improve the kernel's performance a tiny bit by reducing TLB
config DOUBLEFAULT
default y
bool "Enable doublefault exception handler" if EMBEDDED
- depends on X86_32
+ depends on X86_32 && !X86_NO_TSS
help
This option allows trapping of rare doublefault exceptions that
would otherwise cause a system to silently reboot. Disabling this
config KDB
bool "Built-in Kernel Debugger support"
- depends on DEBUG_KERNEL
+ depends on DEBUG_KERNEL && !XEN
select KALLSYMS
select KALLSYMS_ALL
help
mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-x86/mach-summit
mcore-$(CONFIG_X86_SUMMIT) := arch/x86/mach-default/
+# Xen subarch support
+mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-x86/mach-xen
+mcore-$(CONFIG_X86_XEN) := arch/x86/mach-xen/
+
# generic subarchitecture
mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
core-y += $(fcore-y)
# Xen paravirtualization support
-core-$(CONFIG_XEN) += arch/x86/xen/
+core-$(CONFIG_PARAVIRT_XEN) += arch/x86/xen/
# lguest paravirtualization support
core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
boot := arch/x86/boot
-PHONY += zImage bzImage compressed zlilo bzlilo \
+PHONY += zImage bzImage vmlinuz compressed zlilo bzlilo \
zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install
+ifdef CONFIG_XEN
+KBUILD_CPPFLAGS := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \
+ -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(KBUILD_CPPFLAGS)
+
+ifdef CONFIG_X86_64
+LDFLAGS_vmlinux := -e startup_64
+endif
+
+# Default kernel to build
+all: vmlinuz
+
+# KBUILD_IMAGE specifies the target image being built
+KBUILD_IMAGE := $(boot)/vmlinuz
+
+vmlinuz: vmlinux
+ $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
+ $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
+ $(Q)ln -fsn ../../x86/boot/$@ $(objtree)/arch/$(UTS_MACHINE)/boot/$@
+else
# Default kernel to build
all: bzImage
fdimage fdimage144 fdimage288 isoimage: vmlinux
$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
+endif
install:
$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
#RAMDISK := -DRAMDISK=512
-targets := vmlinux.bin setup.bin setup.elf zImage bzImage
+targets := vmlinux.bin setup.bin setup.elf zImage bzImage vmlinuz vmlinux-stripped
subdir- := compressed
setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
cp System.map $(INSTALL_PATH)/
if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
+$(obj)/vmlinuz: $(obj)/vmlinux-stripped FORCE
+ $(call if_changed,gzip)
+ @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
+
+$(obj)/vmlinux-stripped: OBJCOPYFLAGS := -g --strip-unneeded
+$(obj)/vmlinux-stripped: vmlinux FORCE
+ $(call if_changed,objcopy)
+
install:
sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)"
--- /dev/null
+/*
+ * Compatibility mode system call entry point for x86-64.
+ *
+ * Copyright 2000-2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/asm-offsets.h>
+#include <asm/current.h>
+#include <asm/errno.h>
+#include <asm/ia32_unistd.h>
+#include <asm/thread_info.h>
+#include <asm/segment.h>
+#include <asm/irqflags.h>
+#include <linux/linkage.h>
+
+#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
+
+ .macro IA32_ARG_FIXUP noebp=0
+ movl %edi,%r8d
+ .if \noebp
+ .else
+ movl %ebp,%r9d
+ .endif
+ xchg %ecx,%esi
+ movl %ebx,%edi
+ movl %edx,%edx /* zero extension */
+ .endm
+
+ /* clobbers %eax */
+ .macro CLEAR_RREGS
+ xorl %eax,%eax
+ movq %rax,R11(%rsp)
+ movq %rax,R10(%rsp)
+ movq %rax,R9(%rsp)
+ movq %rax,R8(%rsp)
+ .endm
+
+ .macro LOAD_ARGS32 offset
+ movl \offset(%rsp),%r11d
+ movl \offset+8(%rsp),%r10d
+ movl \offset+16(%rsp),%r9d
+ movl \offset+24(%rsp),%r8d
+ movl \offset+40(%rsp),%ecx
+ movl \offset+48(%rsp),%edx
+ movl \offset+56(%rsp),%esi
+ movl \offset+64(%rsp),%edi
+ movl \offset+72(%rsp),%eax
+ .endm
+
+ .macro CFI_STARTPROC32 simple
+ CFI_STARTPROC \simple
+ CFI_UNDEFINED r8
+ CFI_UNDEFINED r9
+ CFI_UNDEFINED r10
+ CFI_UNDEFINED r11
+ CFI_UNDEFINED r12
+ CFI_UNDEFINED r13
+ CFI_UNDEFINED r14
+ CFI_UNDEFINED r15
+ .endm
+
+/*
+ * 32bit SYSENTER instruction entry.
+ *
+ * Arguments:
+ * %eax System call number.
+ * %ebx Arg1
+ * %ecx Arg2
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp user stack
+ * 0(%ebp) Arg6
+ *
+ * Interrupts on.
+ *
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below. Set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
+ENTRY(ia32_sysenter_target)
+ CFI_STARTPROC32 simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,SS+8-RIP+16
+ /*CFI_REL_OFFSET ss,SS-RIP+16*/
+ CFI_REL_OFFSET rsp,RSP-RIP+16
+ /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
+ /*CFI_REL_OFFSET cs,CS-RIP+16*/
+ CFI_REL_OFFSET rip,RIP-RIP+16
+ CFI_REL_OFFSET r11,8
+ CFI_REL_OFFSET rcx,0
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ popq %rcx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rcx
+ movl %ebp,%ebp /* zero extension */
+ movl %eax,%eax
+ movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
+ movl $__USER32_DS,40(%rsp)
+ movq %rbp,32(%rsp)
+ movl $__USER32_CS,16(%rsp)
+ movq %r10,8(%rsp)
+ movq %rax,(%rsp)
+ cld
+ SAVE_ARGS 0,0,1
+ /* no need to do an access_ok check here because rbp has been
+ 32bit zero extended */
+1: movl (%rbp),%r9d
+ .section __ex_table,"a"
+ .quad 1b,ia32_badarg
+ .previous
+ GET_THREAD_INFO(%r10)
+ orl $TS_COMPAT,threadinfo_status(%r10)
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ jnz sysenter_tracesys
+sysenter_do_call:
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja ia32_badsys
+ IA32_ARG_FIXUP 1
+ call *ia32_sys_call_table(,%rax,8)
+ movq %rax,RAX-ARGOFFSET(%rsp)
+ jmp int_ret_from_sys_call
+
+sysenter_tracesys:
+ xchgl %r9d,%ebp
+ SAVE_REST
+ CLEAR_RREGS
+ movq %r9,R9(%rsp)
+ movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
+ movq %rsp,%rdi /* &pt_regs -> arg1 */
+ call syscall_trace_enter
+ LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
+ RESTORE_REST
+ xchgl %ebp,%r9d
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
+ jmp sysenter_do_call
+ CFI_ENDPROC
+ENDPROC(ia32_sysenter_target)
+
+/*
+ * 32bit SYSCALL instruction entry.
+ *
+ * Arguments:
+ * %eax System call number.
+ * %ebx Arg1
+ * %ecx return EIP
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
+ * %esp user stack
+ * 0(%esp) Arg6
+ *
+ * Interrupts on.
+ *
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below. Set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
+ENTRY(ia32_cstar_target)
+ CFI_STARTPROC32 simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,SS+8-RIP+16
+ /*CFI_REL_OFFSET ss,SS-RIP+16*/
+ CFI_REL_OFFSET rsp,RSP-RIP+16
+ /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
+ /*CFI_REL_OFFSET cs,CS-RIP+16*/
+ CFI_REL_OFFSET rip,RIP-RIP+16
+ movl %eax,%eax /* zero extension */
+ movl RSP-RIP+16(%rsp),%r8d
+ SAVE_ARGS -8,1,1
+ movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
+ movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+ movl %ebp,%ecx
+ movl $__USER32_CS,CS-ARGOFFSET(%rsp)
+ movl $__USER32_DS,SS-ARGOFFSET(%rsp)
+ /* no need to do an access_ok check here because r8 has been
+ 32bit zero extended */
+ /* hardware stack frame is complete now */
+1: movl (%r8),%r9d
+ .section __ex_table,"a"
+ .quad 1b,ia32_badarg
+ .previous
+ GET_THREAD_INFO(%r10)
+ orl $TS_COMPAT,threadinfo_status(%r10)
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ jnz cstar_tracesys
+cstar_do_call:
+ cmpl $IA32_NR_syscalls-1,%eax
+ ja ia32_badsys
+ IA32_ARG_FIXUP 1
+ call *ia32_sys_call_table(,%rax,8)
+ movq %rax,RAX-ARGOFFSET(%rsp)
+ jmp int_ret_from_sys_call
+
+cstar_tracesys:
+ xchgl %r9d,%ebp
+ SAVE_REST
+ CLEAR_RREGS
+ movq %r9,R9(%rsp)
+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
+ movq %rsp,%rdi /* &pt_regs -> arg1 */
+ call syscall_trace_enter
+ LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
+ RESTORE_REST
+ xchgl %ebp,%r9d
+ movl RSP-ARGOFFSET(%rsp), %r8d
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
+ jmp cstar_do_call
+END(ia32_cstar_target)
+
+ia32_badarg:
+ movq $-EFAULT,%rax
+ jmp ia32_sysret
+ CFI_ENDPROC
+
+/*
+ * Emulated IA32 system calls via int 0x80.
+ *
+ * Arguments:
+ * %eax System call number.
+ * %ebx Arg1
+ * %ecx Arg2
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
+ *
+ * Notes:
+ * Uses the same stack frame as the x86-64 version.
+ * All registers except %eax must be saved (but ptrace may violate that)
+ * Arguments are zero extended. For system calls that want sign extension and
+ * take long arguments a wrapper is needed. Most calls can just be called
+ * directly.
+ * Assumes it is only called from user space and entered with interrupts on.
+ */
+
+ENTRY(ia32_syscall)
+ CFI_STARTPROC32 simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,SS+8-RIP+16
+ /*CFI_REL_OFFSET ss,SS-RIP+16*/
+ CFI_REL_OFFSET rsp,RSP-RIP+16
+ /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/
+ /*CFI_REL_OFFSET cs,CS-RIP+16*/
+ CFI_REL_OFFSET rip,RIP-RIP+16
+ CFI_REL_OFFSET r11,8
+ CFI_REL_OFFSET rcx,0
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ popq %rcx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rcx
+ movl %eax,%eax
+ movq %rax,(%rsp)
+ cld
+ /* note the registers are not zero extended to the sf.
+ this could be a problem. */
+ SAVE_ARGS 0,0,1
+ GET_THREAD_INFO(%r10)
+ orl $TS_COMPAT,threadinfo_status(%r10)
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ jnz ia32_tracesys
+ia32_do_syscall:
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
+ IA32_ARG_FIXUP
+ call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
+ia32_sysret:
+ movq %rax,RAX-ARGOFFSET(%rsp)
+ jmp int_ret_from_sys_call
+
+ia32_tracesys:
+ SAVE_REST
+ CLEAR_RREGS
+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
+ movq %rsp,%rdi /* &pt_regs -> arg1 */
+ call syscall_trace_enter
+ LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
+ RESTORE_REST
+ jmp ia32_do_syscall
+END(ia32_syscall)
+
+ia32_badsys:
+ movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+ movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
+ jmp int_ret_from_sys_call
+
+quiet_ni_syscall:
+ movq $-ENOSYS,%rax
+ ret
+ CFI_ENDPROC
+
+ .macro PTREGSCALL label, func, arg
+ .globl \label
+\label:
+ leaq \func(%rip),%rax
+ leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
+ jmp ia32_ptregs_common
+ .endm
+
+ CFI_STARTPROC32
+
+ PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
+ PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
+ PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
+ PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
+ PTREGSCALL stub32_execve, sys32_execve, %rcx
+ PTREGSCALL stub32_fork, sys_fork, %rdi
+ PTREGSCALL stub32_clone, sys32_clone, %rdx
+ PTREGSCALL stub32_vfork, sys_vfork, %rdi
+ PTREGSCALL stub32_iopl, sys_iopl, %rsi
+ PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
+
+ENTRY(ia32_ptregs_common)
+ popq %r11
+ CFI_ENDPROC
+ CFI_STARTPROC32 simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,SS+8-ARGOFFSET
+ CFI_REL_OFFSET rax,RAX-ARGOFFSET
+ CFI_REL_OFFSET rcx,RCX-ARGOFFSET
+ CFI_REL_OFFSET rdx,RDX-ARGOFFSET
+ CFI_REL_OFFSET rsi,RSI-ARGOFFSET
+ CFI_REL_OFFSET rdi,RDI-ARGOFFSET
+ CFI_REL_OFFSET rip,RIP-ARGOFFSET
+/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
+/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
+ CFI_REL_OFFSET rsp,RSP-ARGOFFSET
+/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
+ SAVE_REST
+ call *%rax
+ RESTORE_REST
+ jmp ia32_sysret /* misbalances the return cache */
+ CFI_ENDPROC
+END(ia32_ptregs_common)
+
+ .section .rodata,"a"
+ .align 8
+ia32_sys_call_table:
+ .quad sys_restart_syscall
+ .quad sys_exit
+ .quad stub32_fork
+ .quad sys_read
+ .quad sys_write
+ .quad compat_sys_open /* 5 */
+ .quad sys_close
+ .quad sys32_waitpid
+ .quad sys_creat
+ .quad sys_link
+ .quad sys_unlink /* 10 */
+ .quad stub32_execve
+ .quad sys_chdir
+ .quad compat_sys_time
+ .quad sys_mknod
+ .quad sys_chmod /* 15 */
+ .quad sys_lchown16
+ .quad quiet_ni_syscall /* old break syscall holder */
+ .quad sys_stat
+ .quad sys32_lseek
+ .quad sys_getpid /* 20 */
+ .quad compat_sys_mount /* mount */
+ .quad sys_oldumount /* old_umount */
+ .quad sys_setuid16
+ .quad sys_getuid16
+ .quad compat_sys_stime /* stime */ /* 25 */
+ .quad compat_sys_ptrace /* ptrace */
+ .quad sys_alarm
+ .quad sys_fstat /* (old)fstat */
+ .quad sys_pause
+ .quad compat_sys_utime /* 30 */
+ .quad quiet_ni_syscall /* old stty syscall holder */
+ .quad quiet_ni_syscall /* old gtty syscall holder */
+ .quad sys_access
+ .quad sys_nice
+ .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
+ .quad sys_sync
+ .quad sys32_kill
+ .quad sys_rename
+ .quad sys_mkdir
+ .quad sys_rmdir /* 40 */
+ .quad sys_dup
+ .quad sys32_pipe
+ .quad compat_sys_times
+ .quad quiet_ni_syscall /* old prof syscall holder */
+ .quad sys_brk /* 45 */
+ .quad sys_setgid16
+ .quad sys_getgid16
+ .quad sys_signal
+ .quad sys_geteuid16
+ .quad sys_getegid16 /* 50 */
+ .quad sys_acct
+ .quad sys_umount /* new_umount */
+ .quad quiet_ni_syscall /* old lock syscall holder */
+ .quad compat_sys_ioctl
+ .quad compat_sys_fcntl64 /* 55 */
+ .quad quiet_ni_syscall /* old mpx syscall holder */
+ .quad sys_setpgid
+ .quad quiet_ni_syscall /* old ulimit syscall holder */
+ .quad sys32_olduname
+ .quad sys_umask /* 60 */
+ .quad sys_chroot
+ .quad sys32_ustat
+ .quad sys_dup2
+ .quad sys_getppid
+ .quad sys_getpgrp /* 65 */
+ .quad sys_setsid
+ .quad sys32_sigaction
+ .quad sys_sgetmask
+ .quad sys_ssetmask
+ .quad sys_setreuid16 /* 70 */
+ .quad sys_setregid16
+ .quad stub32_sigsuspend
+ .quad compat_sys_sigpending
+ .quad sys_sethostname
+ .quad compat_sys_setrlimit /* 75 */
+ .quad compat_sys_old_getrlimit /* old_getrlimit */
+ .quad compat_sys_getrusage
+ .quad sys32_gettimeofday
+ .quad sys32_settimeofday
+ .quad sys_getgroups16 /* 80 */
+ .quad sys_setgroups16
+ .quad sys32_old_select
+ .quad sys_symlink
+ .quad sys_lstat
+ .quad sys_readlink /* 85 */
+ .quad sys_uselib
+ .quad sys_swapon
+ .quad sys_reboot
+ .quad compat_sys_old_readdir
+ .quad sys32_mmap /* 90 */
+ .quad sys_munmap
+ .quad sys_truncate
+ .quad sys_ftruncate
+ .quad sys_fchmod
+ .quad sys_fchown16 /* 95 */
+ .quad sys_getpriority
+ .quad sys_setpriority
+ .quad quiet_ni_syscall /* old profil syscall holder */
+ .quad compat_sys_statfs
+ .quad compat_sys_fstatfs /* 100 */
+ .quad sys_ioperm
+ .quad compat_sys_socketcall
+ .quad sys_syslog
+ .quad compat_sys_setitimer
+ .quad compat_sys_getitimer /* 105 */
+ .quad compat_sys_newstat
+ .quad compat_sys_newlstat
+ .quad compat_sys_newfstat
+ .quad sys32_uname
+ .quad stub32_iopl /* 110 */
+ .quad sys_vhangup
+ .quad quiet_ni_syscall /* old "idle" system call */
+ .quad sys32_vm86_warning /* vm86old */
+ .quad compat_sys_wait4
+ .quad sys_swapoff /* 115 */
+ .quad compat_sys_sysinfo
+ .quad sys32_ipc
+ .quad sys_fsync
+ .quad stub32_sigreturn
+ .quad stub32_clone /* 120 */
+ .quad sys_setdomainname
+ .quad sys_uname
+ .quad sys_modify_ldt
+ .quad compat_sys_adjtimex
+ .quad sys32_mprotect /* 125 */
+ .quad compat_sys_sigprocmask
+ .quad quiet_ni_syscall /* create_module */
+ .quad sys_init_module
+ .quad sys_delete_module
+ .quad quiet_ni_syscall /* 130 get_kernel_syms */
+ .quad sys32_quotactl
+ .quad sys_getpgid
+ .quad sys_fchdir
+ .quad quiet_ni_syscall /* bdflush */
+ .quad sys_sysfs /* 135 */
+ .quad sys_personality
+ .quad quiet_ni_syscall /* for afs_syscall */
+ .quad sys_setfsuid16
+ .quad sys_setfsgid16
+ .quad sys_llseek /* 140 */
+ .quad compat_sys_getdents
+ .quad compat_sys_select
+ .quad sys_flock
+ .quad sys_msync
+ .quad compat_sys_readv /* 145 */
+ .quad compat_sys_writev
+ .quad sys_getsid
+ .quad sys_fdatasync
+ .quad sys32_sysctl /* sysctl */
+ .quad sys_mlock /* 150 */
+ .quad sys_munlock
+ .quad sys_mlockall
+ .quad sys_munlockall
+ .quad sys_sched_setparam
+ .quad sys_sched_getparam /* 155 */
+ .quad sys_sched_setscheduler
+ .quad sys_sched_getscheduler
+ .quad sys_sched_yield
+ .quad sys_sched_get_priority_max
+ .quad sys_sched_get_priority_min /* 160 */
+ .quad sys32_sched_rr_get_interval
+ .quad compat_sys_nanosleep
+ .quad sys_mremap
+ .quad sys_setresuid16
+ .quad sys_getresuid16 /* 165 */
+ .quad sys32_vm86_warning /* vm86 */
+ .quad quiet_ni_syscall /* query_module */
+ .quad sys_poll
+ .quad compat_sys_nfsservctl
+ .quad sys_setresgid16 /* 170 */
+ .quad sys_getresgid16
+ .quad sys_prctl
+ .quad stub32_rt_sigreturn
+ .quad sys32_rt_sigaction
+ .quad sys32_rt_sigprocmask /* 175 */
+ .quad sys32_rt_sigpending
+ .quad compat_sys_rt_sigtimedwait
+ .quad sys32_rt_sigqueueinfo
+ .quad stub32_rt_sigsuspend
+ .quad sys32_pread /* 180 */
+ .quad sys32_pwrite
+ .quad sys_chown16
+ .quad sys_getcwd
+ .quad sys_capget
+ .quad sys_capset
+ .quad stub32_sigaltstack
+ .quad sys32_sendfile
+ .quad quiet_ni_syscall /* streams1 */
+ .quad quiet_ni_syscall /* streams2 */
+ .quad stub32_vfork /* 190 */
+ .quad compat_sys_getrlimit
+ .quad sys32_mmap2
+ .quad sys32_truncate64
+ .quad sys32_ftruncate64
+ .quad sys32_stat64 /* 195 */
+ .quad sys32_lstat64
+ .quad sys32_fstat64
+ .quad sys_lchown
+ .quad sys_getuid
+ .quad sys_getgid /* 200 */
+ .quad sys_geteuid
+ .quad sys_getegid
+ .quad sys_setreuid
+ .quad sys_setregid
+ .quad sys_getgroups /* 205 */
+ .quad sys_setgroups
+ .quad sys_fchown
+ .quad sys_setresuid
+ .quad sys_getresuid
+ .quad sys_setresgid /* 210 */
+ .quad sys_getresgid
+ .quad sys_chown
+ .quad sys_setuid
+ .quad sys_setgid
+ .quad sys_setfsuid /* 215 */
+ .quad sys_setfsgid
+ .quad sys_pivot_root
+ .quad sys_mincore
+ .quad sys_madvise
+ .quad compat_sys_getdents64 /* 220 getdents64 */
+ .quad compat_sys_fcntl64
+ .quad quiet_ni_syscall /* tux */
+ .quad quiet_ni_syscall /* security */
+ .quad sys_gettid
+ .quad sys32_readahead /* 225 */
+ .quad sys_setxattr
+ .quad sys_lsetxattr
+ .quad sys_fsetxattr
+ .quad sys_getxattr
+ .quad sys_lgetxattr /* 230 */
+ .quad sys_fgetxattr
+ .quad sys_listxattr
+ .quad sys_llistxattr
+ .quad sys_flistxattr
+ .quad sys_removexattr /* 235 */
+ .quad sys_lremovexattr
+ .quad sys_fremovexattr
+ .quad sys_tkill
+ .quad sys_sendfile64
+ .quad compat_sys_futex /* 240 */
+ .quad compat_sys_sched_setaffinity
+ .quad compat_sys_sched_getaffinity
+ .quad sys_set_thread_area
+ .quad sys_get_thread_area
+ .quad compat_sys_io_setup /* 245 */
+ .quad sys_io_destroy
+ .quad compat_sys_io_getevents
+ .quad compat_sys_io_submit
+ .quad sys_io_cancel
+ .quad sys32_fadvise64 /* 250 */
+ .quad quiet_ni_syscall /* free_huge_pages */
+ .quad sys_exit_group
+ .quad sys32_lookup_dcookie
+ .quad sys_epoll_create
+ .quad sys_epoll_ctl /* 255 */
+ .quad sys_epoll_wait
+ .quad sys_remap_file_pages
+ .quad sys_set_tid_address
+ .quad compat_sys_timer_create
+ .quad compat_sys_timer_settime /* 260 */
+ .quad compat_sys_timer_gettime
+ .quad sys_timer_getoverrun
+ .quad sys_timer_delete
+ .quad compat_sys_clock_settime
+ .quad compat_sys_clock_gettime /* 265 */
+ .quad compat_sys_clock_getres
+ .quad compat_sys_clock_nanosleep
+ .quad compat_sys_statfs64
+ .quad compat_sys_fstatfs64
+ .quad sys_tgkill /* 270 */
+ .quad compat_sys_utimes
+ .quad sys32_fadvise64_64
+ .quad quiet_ni_syscall /* sys_vserver */
+ .quad sys_mbind
+ .quad compat_sys_get_mempolicy /* 275 */
+ .quad sys_set_mempolicy
+ .quad compat_sys_mq_open
+ .quad sys_mq_unlink
+ .quad compat_sys_mq_timedsend
+ .quad compat_sys_mq_timedreceive /* 280 */
+ .quad compat_sys_mq_notify
+ .quad compat_sys_mq_getsetattr
+ .quad compat_sys_kexec_load /* reserved for kexec */
+ .quad compat_sys_waitid
+ .quad quiet_ni_syscall /* 285: sys_altroot */
+ .quad sys_add_key
+ .quad sys_request_key
+ .quad sys_keyctl
+ .quad sys_ioprio_set
+ .quad sys_ioprio_get /* 290 */
+ .quad sys_inotify_init
+ .quad sys_inotify_add_watch
+ .quad sys_inotify_rm_watch
+ .quad sys_migrate_pages
+ .quad compat_sys_openat /* 295 */
+ .quad sys_mkdirat
+ .quad sys_mknodat
+ .quad sys_fchownat
+ .quad compat_sys_futimesat
+ .quad sys32_fstatat /* 300 */
+ .quad sys_unlinkat
+ .quad sys_renameat
+ .quad sys_linkat
+ .quad sys_symlinkat
+ .quad sys_readlinkat /* 305 */
+ .quad sys_fchmodat
+ .quad sys_faccessat
+ .quad compat_sys_pselect6
+ .quad compat_sys_ppoll
+ .quad sys_unshare /* 310 */
+ .quad compat_sys_set_robust_list
+ .quad compat_sys_get_robust_list
+ .quad sys_splice
+ .quad sys32_sync_file_range
+ .quad sys_tee /* 315 */
+ .quad compat_sys_vmsplice
+ .quad compat_sys_move_pages
+ .quad sys_getcpu
+ .quad sys_epoll_pwait
+ .quad compat_sys_utimensat /* 320 */
+ .quad compat_sys_signalfd
+ .quad sys_timerfd_create
+ .quad sys_eventfd
+ .quad sys32_fallocate
+ .quad compat_sys_timerfd_settime /* 325 */
+ .quad compat_sys_timerfd_gettime
+ia32_syscall_end:
bb_is_int_reg(dst->base_rc) &&
full_register_dst) {
#ifdef CONFIG_X86_32
+#ifndef TSS_sysenter_sp0
+#define TSS_sysenter_sp0 SYSENTER_stack_sp0
+#endif
/* mov from TSS_sysenter_sp0+offset to esp to fix up the
* sysenter stack, it leaves esp well defined. mov
* TSS_ysenter_sp0+offset(%esp),%esp is followed by up to 5
obj-$(CONFIG_OLPC) += olpc.o
+obj-$(CONFIG_X86_XEN) += fixup.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
- obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o
+ obj-$(CONFIG_X86_LOCAL_APIC) += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o
+ obj-$(CONFIG_X86_XEN_GENAPIC) += genapic_64.o genapic_xen_64.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
+
+ apic_64-$(CONFIG_XEN) += apic_32.o
+ time_64-$(CONFIG_XEN) += time_32.o
endif
+
+disabled-obj-$(CONFIG_XEN) := early-quirks.o genapic_flat_$(BITS).o genx2apic_uv_x.o \
+ hpet.o i8253.o i8259_$(BITS).o pci-swiotlb_64.o reboot.o smpboot.o \
+ tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
$(obj)/realmode/wakeup.bin: FORCE
$(Q)$(MAKE) $(build)=$(obj)/realmode
+disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_%.o
int acpi_skip_timer_override __initdata;
int acpi_use_timer_override __initdata;
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#endif
*/
enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
/* rely on all ACPI tables being in the direct mapping */
char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size)
unsigned long base, offset, mapped_size;
int idx;
+#ifndef CONFIG_XEN
if (phys + size < 8 * 1024 * 1024)
return __va(phys);
+#else
+ if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT))
+ return isa_bus_to_virt(phys);
+#endif
offset = phys & (PAGE_SIZE - 1);
mapped_size = PAGE_SIZE - offset;
return -ENODEV;
}
+#ifndef CONFIG_XEN
if (madt->address) {
acpi_lapic_addr = (u64) madt->address;
printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
madt->address);
}
+#endif
acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
static void __cpuinit acpi_register_lapic(int id, u8 enabled)
{
+#ifndef CONFIG_XEN
unsigned int ver = 0;
+#endif
if (!enabled) {
++disabled_cpus;
return;
}
+#ifndef CONFIG_XEN
#ifdef CONFIG_X86_32
if (boot_cpu_physical_apicid != -1U)
ver = apic_version[boot_cpu_physical_apicid];
#endif
generic_processor_info(id, ver);
+#endif
}
static int __init
acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
const unsigned long end)
{
+#ifndef CONFIG_XEN
struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL;
lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header;
return -EINVAL;
acpi_lapic_addr = lapic_addr_ovr->address;
+#endif
return 0;
}
* returns 0 on success, < 0 on error
*/
+#ifndef CONFIG_XEN
static void __init acpi_register_lapic_address(unsigned long address)
{
mp_lapic_addr = address;
#endif
}
}
+#else
+#define acpi_register_lapic_address(address)
+#endif
static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
{
--- /dev/null
+/*
+ * sleep.c - x86-specific ACPI sleep support.
+ *
+ * Copyright (C) 2001-2003 Patrick Mochel
+ * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
+ */
+
+#include <linux/acpi.h>
+#include <linux/bootmem.h>
+#include <linux/dmi.h>
+#include <linux/cpumask.h>
+
+#include "realmode/wakeup.h"
+#include "sleep.h"
+
+#ifndef CONFIG_ACPI_PV_SLEEP
+unsigned long acpi_wakeup_address;
+unsigned long acpi_realmode_flags;
+
+/* address in low memory of the wakeup routine. */
+static unsigned long acpi_realmode;
+
+#ifdef CONFIG_64BIT
+static char temp_stack[10240];
+#endif
+#endif
+
+/**
+ * acpi_save_state_mem - save kernel state
+ *
+ * Create an identity mapped page table and copy the wakeup routine to
+ * low memory.
+ *
+ * Note that this is too late to change acpi_wakeup_address.
+ */
+int acpi_save_state_mem(void)
+{
+#ifndef CONFIG_ACPI_PV_SLEEP
+ struct wakeup_header *header;
+
+ if (!acpi_realmode) {
+ printk(KERN_ERR "Could not allocate memory during boot, "
+ "S3 disabled\n");
+ return -ENOMEM;
+ }
+ memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
+
+ header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET);
+ if (header->signature != 0x51ee1111) {
+ printk(KERN_ERR "wakeup header does not match\n");
+ return -EINVAL;
+ }
+
+ header->video_mode = saved_video_mode;
+
+ header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
+ /* GDT[0]: GDT self-pointer */
+ header->wakeup_gdt[0] =
+ (u64)(sizeof(header->wakeup_gdt) - 1) +
+ ((u64)(acpi_wakeup_address +
+ ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
+ << 16);
+ /* GDT[1]: real-mode-like code segment */
+ header->wakeup_gdt[1] = (0x009bULL << 40) +
+ ((u64)acpi_wakeup_address << 16) + 0xffff;
+ /* GDT[2]: real-mode-like data segment */
+ header->wakeup_gdt[2] = (0x0093ULL << 40) +
+ ((u64)acpi_wakeup_address << 16) + 0xffff;
+
+#ifndef CONFIG_64BIT
+ store_gdt((struct desc_ptr *)&header->pmode_gdt);
+
+ header->pmode_efer_low = nx_enabled;
+ if (header->pmode_efer_low & 1) {
+ /* This is strange, why not save efer, always? */
+ rdmsr(MSR_EFER, header->pmode_efer_low,
+ header->pmode_efer_high);
+ }
+#endif /* !CONFIG_64BIT */
+
+ header->pmode_cr0 = read_cr0();
+ header->pmode_cr4 = read_cr4();
+ header->realmode_flags = acpi_realmode_flags;
+ header->real_magic = 0x12345678;
+
+#ifndef CONFIG_64BIT
+ header->pmode_entry = (u32)&wakeup_pmode_return;
+ header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET);
+ saved_magic = 0x12345678;
+#else /* CONFIG_64BIT */
+ header->trampoline_segment = setup_trampoline() >> 4;
+ init_rsp = (unsigned long)temp_stack + 4096;
+ initial_code = (unsigned long)wakeup_long64;
+ saved_magic = 0x123456789abcdef0;
+#endif /* CONFIG_64BIT */
+#endif
+
+ return 0;
+}
+
+/*
+ * acpi_restore_state - undo effects of acpi_save_state_mem
+ */
+void acpi_restore_state_mem(void)
+{
+}
+
+
+/**
+ * acpi_reserve_bootmem - do _very_ early ACPI initialisation
+ *
+ * We allocate a page from the first 1MB of memory for the wakeup
+ * routine for when we come back from a sleep state. The
+ * runtime allocator allows specification of <16MB pages, but not
+ * <1MB pages.
+ */
+void __init acpi_reserve_bootmem(void)
+{
+#ifndef CONFIG_ACPI_PV_SLEEP
+ if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
+ printk(KERN_ERR
+ "ACPI: Wakeup code way too big, S3 disabled.\n");
+ return;
+ }
+
+ acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE);
+
+ if (!acpi_realmode) {
+ printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
+ return;
+ }
+
+ acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
+#endif
+}
+
+
+#ifndef CONFIG_ACPI_PV_SLEEP
+static int __init acpi_sleep_setup(char *str)
+{
+ while ((str != NULL) && (*str != '\0')) {
+ if (strncmp(str, "s3_bios", 7) == 0)
+ acpi_realmode_flags |= 1;
+ if (strncmp(str, "s3_mode", 7) == 0)
+ acpi_realmode_flags |= 2;
+ if (strncmp(str, "s3_beep", 7) == 0)
+ acpi_realmode_flags |= 4;
+ str = strchr(str, ',');
+ if (str != NULL)
+ str += strspn(str, ", \t");
+ }
+ return 1;
+}
+
+__setup("acpi_sleep=", acpi_sleep_setup);
+#endif /* CONFIG_ACPI_PV_SLEEP */
--- /dev/null
+/*
+ * Local APIC handling stubs
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+
+#include <asm/smp.h>
+
+/*
+ * Debug level, exported for io_apic.c
+ */
+int apic_verbosity;
+
+static int __init apic_set_verbosity(char *str)
+{
+ if (strcmp("debug", str) == 0)
+ apic_verbosity = APIC_DEBUG;
+ else if (strcmp("verbose", str) == 0)
+ apic_verbosity = APIC_VERBOSE;
+ return 1;
+}
+
+__setup("apic=", apic_set_verbosity);
+
+int setup_profiling_timer(unsigned int multiplier)
+{
+ return -EINVAL;
+}
+
+/*
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ */
+int __init APIC_init_uniprocessor(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ if (smp_found_config)
+ if (!skip_ioapic_setup && nr_ioapics)
+ setup_IO_APIC();
+#endif
+
+ return 0;
+}
#include <asm/bootparam.h>
#include <asm/elf.h>
+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
#include <xen/interface/xen.h>
+#endif
+#ifdef CONFIG_LGUEST_GUEST
#include <linux/lguest.h>
#include "../../../drivers/lguest/lg.h"
+#endif
/* workaround for a warning with -Wmissing-prototypes */
void foo(void);
OFFSET(TI_exec_domain, thread_info, exec_domain);
OFFSET(TI_flags, thread_info, flags);
OFFSET(TI_status, thread_info, status);
+ OFFSET(TI_cpu, thread_info, cpu);
OFFSET(TI_preempt_count, thread_info, preempt_count);
OFFSET(TI_addr_limit, thread_info, addr_limit);
OFFSET(TI_restart_block, thread_info, restart_block);
OFFSET(pbe_orig_address, pbe, orig_address);
OFFSET(pbe_next, pbe, next);
+#ifndef CONFIG_X86_NO_TSS
/* Offset from the sysenter stack to tss.sp0 */
- DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
+ DEFINE(SYSENTER_stack_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
sizeof(struct tss_struct));
+#else
+ /* sysenter stack points directly to sp0 */
+ DEFINE(SYSENTER_stack_sp0, 0);
+#endif
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+#ifdef CONFIG_XEN
+ BLANK();
+ OFFSET(XEN_START_mfn_list, start_info, mfn_list);
+#endif
+
#ifdef CONFIG_PARAVIRT
BLANK();
OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
#endif
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
BLANK();
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
ENTRY(cr8);
BLANK();
#undef ENTRY
+#ifndef CONFIG_X86_NO_TSS
DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
BLANK();
+#endif
DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
BLANK();
DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_CPU_FREQ) += cpufreq/
+ifneq ($(CONFIG_XEN),y)
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+endif
extern void vide(void);
__asm__(".align 4\nvide: ret");
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
#define ENABLE_C1E_MASK 0x18000000
#define CPUID_PROCESSOR_SIGNATURE 1
#define CPUID_XFAM 0x0ff00000
num_cache_leaves = 3;
}
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
if (amd_apic_timer_broken())
local_apic_timer_disabled = 1;
#endif
--- /dev/null
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/bootmem.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/msr.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/mtrr.h>
+#include <asm/mce.h>
+#include <asm/pat.h>
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <mach_apic.h>
+#endif
+#include <asm/hypervisor.h>
+
+#include "cpu.h"
+
+DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+#ifndef CONFIG_XEN
+ /*
+ * Segments used for calling PnP BIOS have byte granularity.
+ * They code segments and data segments have fixed 64k limits,
+ * the transfer segment sizes are set at run time.
+ */
+ /* 32-bit code */
+ [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+ /* 16-bit code */
+ [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+ /* 16-bit data */
+ [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+ /* 16-bit data */
+ [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+ /* 16-bit data */
+ [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+ /*
+ * The APM segments have byte granularity and their bases
+ * are set at run time. All have 64k limits.
+ */
+ /* 32-bit code */
+ [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
+ /* 16-bit code */
+ [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+ /* data */
+ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
+
+ [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
+#endif
+ [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
+} };
+EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+
+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+
+static int cachesize_override __cpuinitdata = -1;
+static int disable_x86_serial_nr __cpuinitdata = 1;
+
+struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+ /* Not much we can do here... */
+ /* Check if at least it has cpuid */
+ if (c->cpuid_level == -1) {
+ /* No cpuid. It must be an ancient CPU */
+ if (c->x86 == 4)
+ strcpy(c->x86_model_id, "486");
+ else if (c->x86 == 3)
+ strcpy(c->x86_model_id, "386");
+ }
+}
+
+static struct cpu_dev __cpuinitdata default_cpu = {
+ .c_init = default_init,
+ .c_vendor = "Unknown",
+};
+static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+
+static int __init cachesize_setup(char *str)
+{
+ get_option(&str, &cachesize_override);
+ return 1;
+}
+__setup("cachesize=", cachesize_setup);
+
+int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+{
+ unsigned int *v;
+ char *p, *q;
+
+ if (cpuid_eax(0x80000000) < 0x80000004)
+ return 0;
+
+ v = (unsigned int *) c->x86_model_id;
+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+ c->x86_model_id[48] = 0;
+
+ /* Intel chips right-justify this string for some dumb reason;
+ undo that brain damage */
+ p = q = &c->x86_model_id[0];
+ while (*p == ' ')
+ p++;
+ if (p != q) {
+ while (*p)
+ *q++ = *p++;
+ while (q <= &c->x86_model_id[48])
+ *q++ = '\0'; /* Zero-pad the rest */
+ }
+
+ return 1;
+}
+
+
+void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+{
+ unsigned int n, dummy, ecx, edx, l2size;
+
+ n = cpuid_eax(0x80000000);
+
+ if (n >= 0x80000005) {
+ cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+ c->x86_cache_size = (ecx>>24)+(edx>>24);
+ }
+
+ if (n < 0x80000006) /* Some chips just has a large L1. */
+ return;
+
+ ecx = cpuid_ecx(0x80000006);
+ l2size = ecx >> 16;
+
+ /* do processor-specific cache resizing */
+ if (this_cpu->c_size_cache)
+ l2size = this_cpu->c_size_cache(c, l2size);
+
+ /* Allow user to override all this if necessary. */
+ if (cachesize_override != -1)
+ l2size = cachesize_override;
+
+ if (l2size == 0)
+ return; /* Again, no L2 cache is possible */
+
+ c->x86_cache_size = l2size;
+
+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
+ l2size, ecx & 0xFF);
+}
+
+/*
+ * Naming convention should be: <Name> [(<Codename>)]
+ * This table only is used unless init_<vendor>() below doesn't set it;
+ * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
+ *
+ */
+
+/* Look up CPU names by table lookup. */
+static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+{
+ struct cpu_model_info *info;
+
+ if (c->x86_model >= 16)
+ return NULL; /* Range check */
+
+ if (!this_cpu)
+ return NULL;
+
+ info = this_cpu->c_models;
+
+ while (info && info->family) {
+ if (info->family == c->x86)
+ return info->model_names[c->x86_model];
+ info++;
+ }
+ return NULL; /* Not found */
+}
+
+
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
+{
+ char *v = c->x86_vendor_id;
+ int i;
+ static int printed;
+
+ for (i = 0; i < X86_VENDOR_NUM; i++) {
+ if (cpu_devs[i]) {
+ if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+ (cpu_devs[i]->c_ident[1] &&
+ !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+ c->x86_vendor = i;
+ if (!early)
+ this_cpu = cpu_devs[i];
+ return;
+ }
+ }
+ }
+ if (!printed) {
+ printed++;
+ printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
+ printk(KERN_ERR "CPU: Your system may be unstable.\n");
+ }
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+ this_cpu = &default_cpu;
+}
+
+
+static int __init x86_fxsr_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
+ setup_clear_cpu_cap(X86_FEATURE_XMM);
+ return 1;
+}
+__setup("nofxsr", x86_fxsr_setup);
+
+
+static int __init x86_sep_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SEP);
+ return 1;
+}
+__setup("nosep", x86_sep_setup);
+
+
+/* Standard macro to see if a specific flag is changeable */
+static inline int flag_is_changeable_p(u32 flag)
+{
+ u32 f1, f2;
+
+ asm("pushfl\n\t"
+ "pushfl\n\t"
+ "popl %0\n\t"
+ "movl %0,%1\n\t"
+ "xorl %2,%0\n\t"
+ "pushl %0\n\t"
+ "popfl\n\t"
+ "pushfl\n\t"
+ "popl %0\n\t"
+ "popfl\n\t"
+ : "=&r" (f1), "=&r" (f2)
+ : "ir" (flag));
+
+ return ((f1^f2) & flag) != 0;
+}
+
+
+/* Probe for the CPUID instruction */
+static int __cpuinit have_cpuid_p(void)
+{
+ return flag_is_changeable_p(X86_EFLAGS_ID);
+}
+
+void __init cpu_detect(struct cpuinfo_x86 *c)
+{
+ /* Get vendor name */
+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+ (unsigned int *)&c->x86_vendor_id[0],
+ (unsigned int *)&c->x86_vendor_id[8],
+ (unsigned int *)&c->x86_vendor_id[4]);
+
+ c->x86 = 4;
+ if (c->cpuid_level >= 0x00000001) {
+ u32 junk, tfms, cap0, misc;
+ cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
+ c->x86 = (tfms >> 8) & 15;
+ c->x86_model = (tfms >> 4) & 15;
+ if (c->x86 == 0xf)
+ c->x86 += (tfms >> 20) & 0xff;
+ if (c->x86 >= 0x6)
+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
+ c->x86_mask = tfms & 15;
+ if (cap0 & (1<<19)) {
+ c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+ }
+ }
+}
+static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
+{
+ u32 tfms, xlvl;
+ unsigned int ebx;
+
+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
+ if (have_cpuid_p()) {
+ /* Intel-defined flags: level 0x00000001 */
+ if (c->cpuid_level >= 0x00000001) {
+ u32 capability, excap;
+ cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
+ c->x86_capability[0] = capability;
+ c->x86_capability[4] = excap;
+ }
+
+ /* AMD-defined flags: level 0x80000001 */
+ xlvl = cpuid_eax(0x80000000);
+ if ((xlvl & 0xffff0000) == 0x80000000) {
+ if (xlvl >= 0x80000001) {
+ c->x86_capability[1] = cpuid_edx(0x80000001);
+ c->x86_capability[6] = cpuid_ecx(0x80000001);
+ }
+ }
+
+ }
+
+}
+
+/*
+ * Do minimum CPU detection early.
+ * Fields really needed: vendor, cpuid_level, family, model, mask,
+ * cache alignment.
+ * The others are not touched to avoid unwanted side effects.
+ *
+ * WARNING: this function is only called on the BP. Don't add code here
+ * that is supposed to run on all CPUs.
+ */
+static void __init early_cpu_detect(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ c->x86_cache_alignment = 32;
+ c->x86_clflush_size = 32;
+
+ if (!have_cpuid_p())
+ return;
+
+ cpu_detect(c);
+
+ get_cpu_vendor(c, 1);
+
+ if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
+ cpu_devs[c->x86_vendor]->c_early_init)
+ cpu_devs[c->x86_vendor]->c_early_init(c);
+
+ early_get_cap(c);
+}
+
+static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
+{
+ u32 tfms, xlvl;
+ unsigned int ebx;
+
+ if (have_cpuid_p()) {
+ /* Get vendor name */
+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+ (unsigned int *)&c->x86_vendor_id[0],
+ (unsigned int *)&c->x86_vendor_id[8],
+ (unsigned int *)&c->x86_vendor_id[4]);
+
+ get_cpu_vendor(c, 0);
+ /* Initialize the standard set of capabilities */
+ /* Note that the vendor-specific code below might override */
+ /* Intel-defined flags: level 0x00000001 */
+ if (c->cpuid_level >= 0x00000001) {
+ u32 capability, excap;
+ cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
+ c->x86_capability[0] = capability;
+ c->x86_capability[4] = excap;
+ c->x86 = (tfms >> 8) & 15;
+ c->x86_model = (tfms >> 4) & 15;
+ if (c->x86 == 0xf)
+ c->x86 += (tfms >> 20) & 0xff;
+ if (c->x86 >= 0x6)
+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
+ c->x86_mask = tfms & 15;
+ c->initial_apicid = (ebx >> 24) & 0xFF;
+#ifdef CONFIG_X86_HT
+ c->apicid = phys_pkg_id(c->initial_apicid, 0);
+ c->phys_proc_id = c->initial_apicid;
+#else
+ c->apicid = c->initial_apicid;
+#endif
+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
+ c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
+ } else {
+ /* Have CPUID level 0 only - unheard of */
+ c->x86 = 4;
+ }
+
+ /* AMD-defined flags: level 0x80000001 */
+ xlvl = cpuid_eax(0x80000000);
+ if ((xlvl & 0xffff0000) == 0x80000000) {
+ if (xlvl >= 0x80000001) {
+ c->x86_capability[1] = cpuid_edx(0x80000001);
+ c->x86_capability[6] = cpuid_ecx(0x80000001);
+ }
+ if (xlvl >= 0x80000004)
+ get_model_name(c); /* Default name */
+ }
+
+ init_scattered_cpuid_features(c);
+ }
+
+}
+
+static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
+ /* Disable processor serial number */
+ unsigned long lo, hi;
+ rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+ lo |= 0x200000;
+ wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+ printk(KERN_NOTICE "CPU serial number disabled.\n");
+ clear_cpu_cap(c, X86_FEATURE_PN);
+
+ /* Disabling the serial number may affect the cpuid level */
+ c->cpuid_level = cpuid_eax(0);
+ }
+}
+
+static int __init x86_serial_nr_setup(char *s)
+{
+ disable_x86_serial_nr = 0;
+ return 1;
+}
+__setup("serialnumber", x86_serial_nr_setup);
+
+
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ c->loops_per_jiffy = loops_per_jiffy;
+ c->x86_cache_size = -1;
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+ c->cpuid_level = -1; /* CPUID not detected */
+ c->x86_model = c->x86_mask = 0; /* So far unknown... */
+ c->x86_vendor_id[0] = '\0'; /* Unset */
+ c->x86_model_id[0] = '\0'; /* Unset */
+ c->x86_max_cores = 1;
+ c->x86_clflush_size = 32;
+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
+
+ if (!have_cpuid_p()) {
+ /*
+ * First of all, decide if this is a 486 or higher
+ * It's a 486 if we can modify the AC flag
+ */
+ if (flag_is_changeable_p(X86_EFLAGS_AC))
+ c->x86 = 4;
+ else
+ c->x86 = 3;
+ }
+
+ generic_identify(c);
+
+ if (this_cpu->c_identify)
+ this_cpu->c_identify(c);
+
+ /*
+ * Vendor-specific initialization. In this section we
+ * canonicalize the feature flags, meaning if there are
+ * features a certain CPU supports which CPUID doesn't
+ * tell us, CPUID claiming incorrect flags, or other bugs,
+ * we handle them here.
+ *
+ * At the end of this section, c->x86_capability better
+ * indicate the features this CPU genuinely supports!
+ */
+ if (this_cpu->c_init)
+ this_cpu->c_init(c);
+
+ /* Disable the PN if appropriate */
+ squash_the_stupid_serial_number(c);
+
+ /*
+ * The vendor-specific functions might have changed features. Now
+ * we do "generic changes."
+ */
+
+ /* If the model name is still unset, do table lookup. */
+ if (!c->x86_model_id[0]) {
+ char *p;
+ p = table_lookup_model(c);
+ if (p)
+ strcpy(c->x86_model_id, p);
+ else
+ /* Last resort... */
+ sprintf(c->x86_model_id, "%02x/%02x",
+ c->x86, c->x86_model);
+ }
+
+ /*
+ * On SMP, boot_cpu_data holds the common feature set between
+ * all CPUs; so make sure that we indicate which features are
+ * common between the CPUs. The first time this routine gets
+ * executed, c == &boot_cpu_data.
+ */
+ if (c != &boot_cpu_data) {
+ /* AND the already accumulated flags with these */
+ for (i = 0 ; i < NCAPINTS ; i++)
+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+ }
+
+ /* Clear all flags overriden by options */
+ for (i = 0; i < NCAPINTS; i++)
+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
+
+ /* Init Machine Check Exception if available. */
+ mcheck_init(c);
+
+ select_idle_routine(c);
+}
+
+void __init identify_boot_cpu(void)
+{
+ identify_cpu(&boot_cpu_data);
+ sysenter_setup();
+ enable_sep_cpu();
+}
+
+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
+ BUG_ON(c == &boot_cpu_data);
+ identify_cpu(c);
+ enable_sep_cpu();
+ mtrr_ap_init();
+}
+
+#ifdef CONFIG_X86_HT
+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+{
+ u32 eax, ebx, ecx, edx;
+ int index_msb, core_bits;
+
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+
+ if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
+ return;
+
+ smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+ if (smp_num_siblings == 1) {
+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
+ } else if (smp_num_siblings > 1) {
+
+ if (smp_num_siblings > NR_CPUS) {
+ printk(KERN_WARNING "CPU: Unsupported number of the "
+ "siblings %d", smp_num_siblings);
+ smp_num_siblings = 1;
+ return;
+ }
+
+ index_msb = get_count_order(smp_num_siblings);
+ c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
+
+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
+ c->phys_proc_id);
+
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+
+ index_msb = get_count_order(smp_num_siblings) ;
+
+ core_bits = get_count_order(c->x86_max_cores);
+
+ c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
+ ((1 << core_bits) - 1);
+
+ if (c->x86_max_cores > 1)
+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
+ c->cpu_core_id);
+ }
+}
+#endif
+
+static __init int setup_noclflush(char *arg)
+{
+ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+ return 1;
+}
+__setup("noclflush", setup_noclflush);
+
+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+{
+ char *vendor = NULL;
+
+ if (c->x86_vendor < X86_VENDOR_NUM)
+ vendor = this_cpu->c_vendor;
+ else if (c->cpuid_level >= 0)
+ vendor = c->x86_vendor_id;
+
+ if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
+ printk("%s ", vendor);
+
+ if (!c->x86_model_id[0])
+ printk("%d86", c->x86);
+ else
+ printk("%s", c->x86_model_id);
+
+ if (c->x86_mask || c->cpuid_level >= 0)
+ printk(" stepping %02x\n", c->x86_mask);
+ else
+ printk("\n");
+}
+
+static __init int setup_disablecpuid(char *arg)
+{
+ int bit;
+ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
+ setup_clear_cpu_cap(bit);
+ else
+ return 0;
+ return 1;
+}
+__setup("clearcpuid=", setup_disablecpuid);
+
+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
+
+void __init early_cpu_init(void)
+{
+ struct cpu_vendor_dev *cvdev;
+
+ for (cvdev = __x86cpuvendor_start ;
+ cvdev < __x86cpuvendor_end ;
+ cvdev++)
+ cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+
+ early_cpu_detect();
+ validate_pat_support(&boot_cpu_data);
+}
+
+/* Make sure %fs is initialized properly in idle threads */
+struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
+{
+ memset(regs, 0, sizeof(struct pt_regs));
+ regs->fs = __KERNEL_PERCPU;
+ return regs;
+}
+
+/* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+void switch_to_new_gdt(void)
+{
+ struct desc_ptr gdt_descr;
+ unsigned long va, frames[16];
+ int f;
+
+ gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+ gdt_descr.size = GDT_SIZE - 1;
+
+ for (va = gdt_descr.address, f = 0;
+ va < gdt_descr.address + gdt_descr.size;
+ va += PAGE_SIZE, f++) {
+ frames[f] = virt_to_mfn(va);
+ make_lowmem_page_readonly(
+ (void *)va, XENFEAT_writable_descriptor_tables);
+ }
+ if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8))
+ BUG();
+ asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
+}
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ */
+void __cpuinit cpu_init(void)
+{
+ int cpu = smp_processor_id();
+ struct task_struct *curr = current;
+#ifndef CONFIG_X86_NO_TSS
+ struct tss_struct *t = &per_cpu(init_tss, cpu);
+#endif
+ struct thread_struct *thread = &curr->thread;
+
+ if (cpu_test_and_set(cpu, cpu_initialized)) {
+ printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
+ for (;;) local_irq_enable();
+ }
+
+ printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+ if (cpu_has_vme || cpu_has_de)
+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+ switch_to_new_gdt();
+
+ /*
+ * Set up and load the per-CPU TSS and LDT
+ */
+ atomic_inc(&init_mm.mm_count);
+ curr->active_mm = &init_mm;
+ if (curr->mm)
+ BUG();
+ enter_lazy_tlb(&init_mm, curr);
+
+ load_sp0(t, thread);
+
+ load_LDT(&init_mm.context);
+
+#ifdef CONFIG_DOUBLEFAULT
+ /* Set up doublefault TSS pointer in the GDT */
+ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
+#endif
+
+ /* Clear %gs. */
+ asm volatile ("mov %0, %%gs" : : "r" (0));
+
+ /* Clear all 6 debug registers: */
+ set_debugreg(0, 0);
+ set_debugreg(0, 1);
+ set_debugreg(0, 2);
+ set_debugreg(0, 3);
+ set_debugreg(0, 6);
+ set_debugreg(0, 7);
+
+ /*
+ * Force FPU initialization:
+ */
+ current_thread_info()->status = 0;
+ clear_used_math();
+ mxcsr_feature_mask_init();
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+void __cpuinit cpu_uninit(void)
+{
+ int cpu = raw_smp_processor_id();
+ cpu_clear(cpu, cpu_initialized);
+
+ /* lazy TLB state */
+ per_cpu(cpu_tlbstate, cpu).state = 0;
+ per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
+}
+#endif
obj-y := main.o if.o generic.o state.o
obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
+obj-$(CONFIG_XEN) := main.o if.o
--- /dev/null
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <asm/uaccess.h>
+#include <linux/mutex.h>
+
+#include <asm/mtrr.h>
+#include "mtrr.h"
+
+static DEFINE_MUTEX(mtrr_mutex);
+
+void generic_get_mtrr(unsigned int reg, unsigned long *base,
+ unsigned long *size, mtrr_type * type)
+{
+ struct xen_platform_op op;
+
+ op.cmd = XENPF_read_memtype;
+ op.u.read_memtype.reg = reg;
+ if (unlikely(HYPERVISOR_platform_op(&op)))
+ memset(&op.u.read_memtype, 0, sizeof(op.u.read_memtype));
+
+ *size = op.u.read_memtype.nr_mfns;
+ *base = op.u.read_memtype.mfn;
+ *type = op.u.read_memtype.type;
+}
+
+struct mtrr_ops generic_mtrr_ops = {
+ .use_intel_if = 1,
+ .get = generic_get_mtrr,
+};
+
+struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
+unsigned int num_var_ranges;
+unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+
+static u64 tom2;
+
+static void __init set_num_var_ranges(void)
+{
+ struct xen_platform_op op;
+
+ for (num_var_ranges = 0; ; num_var_ranges++) {
+ op.cmd = XENPF_read_memtype;
+ op.u.read_memtype.reg = num_var_ranges;
+ if (HYPERVISOR_platform_op(&op) != 0)
+ break;
+ }
+}
+
+static void __init init_table(void)
+{
+ int i, max;
+
+ max = num_var_ranges;
+ for (i = 0; i < max; i++)
+ mtrr_usage_table[i] = 0;
+}
+
+int mtrr_add_page(unsigned long base, unsigned long size,
+ unsigned int type, bool increment)
+{
+ int error;
+ struct xen_platform_op op;
+
+ mutex_lock(&mtrr_mutex);
+
+ op.cmd = XENPF_add_memtype;
+ op.u.add_memtype.mfn = base;
+ op.u.add_memtype.nr_mfns = size;
+ op.u.add_memtype.type = type;
+ error = HYPERVISOR_platform_op(&op);
+ if (error) {
+ mutex_unlock(&mtrr_mutex);
+ BUG_ON(error > 0);
+ return error;
+ }
+
+ if (increment)
+ ++mtrr_usage_table[op.u.add_memtype.reg];
+
+ mutex_unlock(&mtrr_mutex);
+
+ return op.u.add_memtype.reg;
+}
+
+static int mtrr_check(unsigned long base, unsigned long size)
+{
+ if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
+ printk(KERN_WARNING
+ "mtrr: size and base must be multiples of 4 kiB\n");
+ printk(KERN_DEBUG
+ "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
+ dump_stack();
+ return -1;
+ }
+ return 0;
+}
+
+int
+mtrr_add(unsigned long base, unsigned long size, unsigned int type,
+ bool increment)
+{
+ if (mtrr_check(base, size))
+ return -EINVAL;
+ return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
+ increment);
+}
+
+int mtrr_del_page(int reg, unsigned long base, unsigned long size)
+{
+ unsigned i;
+ mtrr_type ltype;
+ unsigned long lbase, lsize;
+ int error = -EINVAL;
+ struct xen_platform_op op;
+
+ mutex_lock(&mtrr_mutex);
+
+ if (reg < 0) {
+ /* Search for existing MTRR */
+ for (i = 0; i < num_var_ranges; ++i) {
+ mtrr_if->get(i, &lbase, &lsize, <ype);
+ if (lbase == base && lsize == size) {
+ reg = i;
+ break;
+ }
+ }
+ if (reg < 0) {
+ printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
+ size);
+ goto out;
+ }
+ }
+ if (mtrr_usage_table[reg] < 1) {
+ printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
+ goto out;
+ }
+ if (--mtrr_usage_table[reg] < 1) {
+ op.cmd = XENPF_del_memtype;
+ op.u.del_memtype.handle = 0;
+ op.u.del_memtype.reg = reg;
+ error = HYPERVISOR_platform_op(&op);
+ if (error) {
+ BUG_ON(error > 0);
+ goto out;
+ }
+ }
+ error = reg;
+ out:
+ mutex_unlock(&mtrr_mutex);
+ return error;
+}
+
+int
+mtrr_del(int reg, unsigned long base, unsigned long size)
+{
+ if (mtrr_check(base, size))
+ return -EINVAL;
+ return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
+}
+
+EXPORT_SYMBOL(mtrr_add);
+EXPORT_SYMBOL(mtrr_del);
+
+/*
+ * Returns the effective MTRR type for the region
+ * Error returns:
+ * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
+ * - 0xFF - when MTRR is not enabled
+ */
+u8 mtrr_type_lookup(u64 start, u64 end)
+{
+ int i, error;
+ u64 start_mfn, end_mfn, base_mfn, top_mfn;
+ u8 prev_match, curr_match;
+ struct xen_platform_op op;
+
+ if (!num_var_ranges)
+ return 0xFF;
+
+ start_mfn = start >> PAGE_SHIFT;
+ /* Make end inclusive end, instead of exclusive */
+ end_mfn = --end >> PAGE_SHIFT;
+
+ /* Look in fixed ranges. Just return the type as per start */
+ if (start_mfn < 0x100) {
+#if 0//todo
+ op.cmd = XENPF_read_memtype;
+ op.u.read_memtype.reg = ???;
+ error = HYPERVISOR_platform_op(&op);
+ if (!error)
+ return op.u.read_memtype.type;
+#endif
+ return MTRR_TYPE_UNCACHABLE;
+ }
+
+ /*
+ * Look in variable ranges
+ * Look of multiple ranges matching this address and pick type
+ * as per MTRR precedence
+ */
+ prev_match = 0xFF;
+ for (i = 0; i < num_var_ranges; ++i) {
+ op.cmd = XENPF_read_memtype;
+ op.u.read_memtype.reg = i;
+ error = HYPERVISOR_platform_op(&op);
+
+ if (error || !op.u.read_memtype.nr_mfns)
+ continue;
+
+ base_mfn = op.u.read_memtype.mfn;
+ top_mfn = base_mfn + op.u.read_memtype.nr_mfns - 1;
+
+ if (base_mfn > end_mfn || start_mfn > top_mfn) {
+ continue;
+ }
+
+ if (base_mfn > start_mfn || end_mfn > top_mfn) {
+ return 0xFE;
+ }
+
+ curr_match = op.u.read_memtype.type;
+ if (prev_match == 0xFF) {
+ prev_match = curr_match;
+ continue;
+ }
+
+ if (prev_match == MTRR_TYPE_UNCACHABLE ||
+ curr_match == MTRR_TYPE_UNCACHABLE) {
+ return MTRR_TYPE_UNCACHABLE;
+ }
+
+ if ((prev_match == MTRR_TYPE_WRBACK &&
+ curr_match == MTRR_TYPE_WRTHROUGH) ||
+ (prev_match == MTRR_TYPE_WRTHROUGH &&
+ curr_match == MTRR_TYPE_WRBACK)) {
+ prev_match = MTRR_TYPE_WRTHROUGH;
+ curr_match = MTRR_TYPE_WRTHROUGH;
+ }
+
+ if (prev_match != curr_match) {
+ return MTRR_TYPE_UNCACHABLE;
+ }
+ }
+
+ if (tom2) {
+ if (start >= (1ULL<<32) && (end < tom2))
+ return MTRR_TYPE_WRBACK;
+ }
+
+ if (prev_match != 0xFF)
+ return prev_match;
+
+#if 0//todo
+ op.cmd = XENPF_read_def_memtype;
+ error = HYPERVISOR_platform_op(&op);
+ if (!error)
+ return op.u.read_def_memtype.type;
+#endif
+ return MTRR_TYPE_UNCACHABLE;
+}
+
+/*
+ * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
+ * for memory >4GB. Check for that here.
+ * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
+ * apply to are wrong, but so far we don't know of any such case in the wild.
+ */
+#define Tom2Enabled (1U << 21)
+#define Tom2ForceMemTypeWB (1U << 22)
+
+int __init amd_special_default_mtrr(void)
+{
+ u32 l, h;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return 0;
+ if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+ return 0;
+ /* In case some hypervisor doesn't pass SYSCFG through */
+ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+ return 0;
+ /*
+ * Memory between 4GB and top of mem is forced WB by this magic bit.
+ * Reserved before K8RevF, but should be zero there.
+ */
+ if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
+ (Tom2Enabled | Tom2ForceMemTypeWB))
+ return 1;
+ return 0;
+}
+
+void __init mtrr_bp_init(void)
+{
+ if (amd_special_default_mtrr()) {
+ /* TOP_MEM2 */
+ rdmsrl(MSR_K8_TOP_MEM2, tom2);
+ tom2 &= 0xffffff8000000ULL;
+ }
+}
+
+void mtrr_ap_init(void)
+{
+}
+
+static int __init mtrr_init(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (!is_initial_xendomain())
+ return -ENODEV;
+
+ if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
+ (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
+ (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
+ (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
+ return -ENODEV;
+
+ set_num_var_ranges();
+ init_table();
+
+ return 0;
+}
+
+subsys_initcall(mtrr_init);
#include <mach_ipi.h>
+#ifndef CONFIG_XEN
/* This keeps a track of which one is crashing cpu. */
static int crashing_cpu;
/* There are no cpus to shootdown */
}
#endif
+#endif /* CONFIG_XEN */
void native_machine_crash_shutdown(struct pt_regs *regs)
{
/* The kernel is broken so disable interrupts */
local_irq_disable();
+#ifndef CONFIG_XEN
/* Make a note of crashing cpu. Will be used in NMI callback.*/
crashing_cpu = safe_smp_processor_id();
nmi_shootdown_cpus();
#if defined(CONFIG_X86_IO_APIC)
disable_IO_APIC();
#endif
+#endif /* CONFIG_XEN */
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
--- /dev/null
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/uaccess.h>
+#include <linux/suspend.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+#include <asm/setup.h>
+#include <xen/interface/memory.h>
+
+struct e820map __initdata e820;
+struct change_member {
+ struct e820entry *pbios; /* pointer to original bios entry */
+ unsigned long long addr; /* address for this change point */
+};
+static struct change_member change_point_list[2*E820MAX] __initdata;
+static struct change_member *change_point[2*E820MAX] __initdata;
+static struct e820entry *overlap_list[E820MAX] __initdata;
+static struct e820entry new_bios[E820MAX] __initdata;
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0x10000000;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
+extern int user_defined_memmap;
+
+static struct resource system_rom_resource = {
+ .name = "System ROM",
+ .start = 0xf0000,
+ .end = 0xfffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource extension_rom_resource = {
+ .name = "Extension ROM",
+ .start = 0xe0000,
+ .end = 0xeffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource adapter_rom_resources[] = { {
+ .name = "Adapter ROM",
+ .start = 0xc8000,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+} };
+
+static struct resource video_rom_resource = {
+ .name = "Video ROM",
+ .start = 0xc0000,
+ .end = 0xc7fff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+#define ROMSIGNATURE 0xaa55
+
+static int __init romsignature(const unsigned char *rom)
+{
+ const unsigned short * const ptr = (const unsigned short *)rom;
+ unsigned short sig;
+
+ return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
+}
+
+static int __init romchecksum(const unsigned char *rom, unsigned long length)
+{
+ unsigned char sum, c;
+
+ for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+ sum += c;
+ return !length && !sum;
+}
+
+static void __init probe_roms(void)
+{
+ const unsigned char *rom;
+ unsigned long start, length, upper;
+ unsigned char c;
+ int i;
+
+#ifdef CONFIG_XEN
+ /* Nothing to do if not running in dom0. */
+ if (!is_initial_xendomain())
+ return;
+#endif
+
+ /* video rom */
+ upper = adapter_rom_resources[0].start;
+ for (start = video_rom_resource.start; start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ video_rom_resource.start = start;
+
+ if (probe_kernel_address(rom + 2, c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* if checksum okay, trust length byte */
+ if (length && romchecksum(rom, length))
+ video_rom_resource.end = start + length - 1;
+
+ request_resource(&iomem_resource, &video_rom_resource);
+ break;
+ }
+
+ start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+ if (start < upper)
+ start = upper;
+
+ /* system rom */
+ request_resource(&iomem_resource, &system_rom_resource);
+ upper = system_rom_resource.start;
+
+ /* check for extension rom (ignore length byte!) */
+ rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
+ if (romsignature(rom)) {
+ length = extension_rom_resource.end - extension_rom_resource.start + 1;
+ if (romchecksum(rom, length)) {
+ request_resource(&iomem_resource, &extension_rom_resource);
+ upper = extension_rom_resource.start;
+ }
+ }
+
+ /* check for adapter roms on 2k boundaries */
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ if (probe_kernel_address(rom + 2, c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* but accept any length that fits if checksum okay */
+ if (!length || start + length > upper || !romchecksum(rom, length))
+ continue;
+
+ adapter_rom_resources[i].start = start;
+ adapter_rom_resources[i].end = start + length - 1;
+ request_resource(&iomem_resource, &adapter_rom_resources[i]);
+
+ start = adapter_rom_resources[i++].end & ~2047UL;
+ }
+}
+
+#ifdef CONFIG_XEN
+static struct e820map machine_e820;
+#define e820 machine_e820
+#endif
+
+/*
+ * Request address space for all standard RAM and ROM resources
+ * and also for regions reported as reserved by the e820.
+ */
+void __init init_iomem_resources(struct resource *code_resource,
+ struct resource *data_resource,
+ struct resource *bss_resource)
+{
+ int i;
+
+ probe_roms();
+ for (i = 0; i < e820.nr_map; i++) {
+ struct resource *res;
+#ifndef CONFIG_RESOURCES_64BIT
+ if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
+ continue;
+#endif
+ res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
+ switch (e820.map[i].type) {
+ case E820_RAM: res->name = "System RAM"; break;
+ case E820_ACPI: res->name = "ACPI Tables"; break;
+ case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
+ default: res->name = "reserved";
+ }
+ res->start = e820.map[i].addr;
+ res->end = res->start + e820.map[i].size - 1;
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ if (request_resource(&iomem_resource, res)) {
+ kfree(res);
+ continue;
+ }
+ if (e820.map[i].type == E820_RAM) {
+ /*
+ * We don't know which RAM region contains kernel data,
+ * so we try it repeatedly and let the resource manager
+ * test it.
+ */
+#ifndef CONFIG_XEN
+ request_resource(res, code_resource);
+ request_resource(res, data_resource);
+ request_resource(res, bss_resource);
+#endif
+#ifdef CONFIG_KEXEC
+ if (crashk_res.start != crashk_res.end)
+ request_resource(res, &crashk_res);
+#ifdef CONFIG_XEN
+ xen_machine_kexec_register_resources(res);
+#endif
+#endif
+ }
+ }
+}
+
+#undef e820
+
+#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
+/**
+ * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
+ * correspond to e820 RAM areas and mark the corresponding pages as nosave for
+ * hibernation.
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(void)
+{
+ int i;
+ unsigned long pfn;
+
+ pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
+ for (i = 1; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (pfn < PFN_UP(ei->addr))
+ register_nosave_region(pfn, PFN_UP(ei->addr));
+
+ pfn = PFN_DOWN(ei->addr + ei->size);
+ if (ei->type != E820_RAM)
+ register_nosave_region(PFN_UP(ei->addr), pfn);
+
+ if (pfn >= max_low_pfn)
+ break;
+ }
+}
+#endif
+
+void __init add_memory_region(unsigned long long start,
+ unsigned long long size, int type)
+{
+ int x;
+
+ x = e820.nr_map;
+
+ if (x == E820MAX) {
+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+ return;
+ }
+
+ e820.map[x].addr = start;
+ e820.map[x].size = size;
+ e820.map[x].type = type;
+ e820.nr_map++;
+} /* add_memory_region */
+
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries. The following
+ * replaces the original e820 map with a new one, removing overlaps.
+ *
+ */
+int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
+{
+ struct change_member *change_tmp;
+ unsigned long current_type, last_type;
+ unsigned long long last_addr;
+ int chgidx, still_changing;
+ int overlap_entries;
+ int new_bios_entry;
+ int old_nr, new_nr, chg_nr;
+ int i;
+
+ /*
+ Visually we're performing the following (1,2,3,4 = memory types)...
+
+ Sample memory map (w/overlaps):
+ ____22__________________
+ ______________________4_
+ ____1111________________
+ _44_____________________
+ 11111111________________
+ ____________________33__
+ ___________44___________
+ __________33333_________
+ ______________22________
+ ___________________2222_
+ _________111111111______
+ _____________________11_
+ _________________4______
+
+ Sanitized equivalent (no overlap):
+ 1_______________________
+ _44_____________________
+ ___1____________________
+ ____22__________________
+ ______11________________
+ _________1______________
+ __________3_____________
+ ___________44___________
+ _____________33_________
+ _______________2________
+ ________________1_______
+ _________________4______
+ ___________________2____
+ ____________________33__
+ ______________________4_
+ */
+ /* if there's only one memory region, don't bother */
+ if (*pnr_map < 2) {
+ return -1;
+ }
+
+ old_nr = *pnr_map;
+
+ /* bail out if we find any unreasonable addresses in bios map */
+ for (i=0; i<old_nr; i++)
+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
+ return -1;
+ }
+
+ /* create pointers for initial change-point information (for sorting) */
+ for (i=0; i < 2*old_nr; i++)
+ change_point[i] = &change_point_list[i];
+
+ /* record all known change-points (starting and ending addresses),
+ omitting those that are for empty memory regions */
+ chgidx = 0;
+ for (i=0; i < old_nr; i++) {
+ if (biosmap[i].size != 0) {
+ change_point[chgidx]->addr = biosmap[i].addr;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ }
+ }
+ chg_nr = chgidx; /* true number of change-points */
+
+ /* sort change-point list by memory addresses (low -> high) */
+ still_changing = 1;
+ while (still_changing) {
+ still_changing = 0;
+ for (i=1; i < chg_nr; i++) {
+ /* if <current_addr> > <last_addr>, swap */
+ /* or, if current=<start_addr> & last=<end_addr>, swap */
+ if ((change_point[i]->addr < change_point[i-1]->addr) ||
+ ((change_point[i]->addr == change_point[i-1]->addr) &&
+ (change_point[i]->addr == change_point[i]->pbios->addr) &&
+ (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
+ )
+ {
+ change_tmp = change_point[i];
+ change_point[i] = change_point[i-1];
+ change_point[i-1] = change_tmp;
+ still_changing=1;
+ }
+ }
+ }
+
+ /* create a new bios memory map, removing overlaps */
+ overlap_entries=0; /* number of entries in the overlap table */
+ new_bios_entry=0; /* index for creating new bios map entries */
+ last_type = 0; /* start with undefined memory type */
+ last_addr = 0; /* start with 0 as last starting address */
+ /* loop through change-points, determining affect on the new bios map */
+ for (chgidx=0; chgidx < chg_nr; chgidx++)
+ {
+ /* keep track of all overlapping bios entries */
+ if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
+ {
+ /* add map entry to overlap list (> 1 entry implies an overlap) */
+ overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
+ }
+ else
+ {
+ /* remove entry from list (order independent, so swap with last) */
+ for (i=0; i<overlap_entries; i++)
+ {
+ if (overlap_list[i] == change_point[chgidx]->pbios)
+ overlap_list[i] = overlap_list[overlap_entries-1];
+ }
+ overlap_entries--;
+ }
+ /* if there are overlapping entries, decide which "type" to use */
+ /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
+ current_type = 0;
+ for (i=0; i<overlap_entries; i++)
+ if (overlap_list[i]->type > current_type)
+ current_type = overlap_list[i]->type;
+ /* continue building up new bios map based on this information */
+ if (current_type != last_type) {
+ if (last_type != 0) {
+ new_bios[new_bios_entry].size =
+ change_point[chgidx]->addr - last_addr;
+ /* move forward only if the new size was non-zero */
+ if (new_bios[new_bios_entry].size != 0)
+ if (++new_bios_entry >= E820MAX)
+ break; /* no more space left for new bios entries */
+ }
+ if (current_type != 0) {
+ new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
+ new_bios[new_bios_entry].type = current_type;
+ last_addr=change_point[chgidx]->addr;
+ }
+ last_type = current_type;
+ }
+ }
+ new_nr = new_bios_entry; /* retain count for new bios entries */
+
+ /* copy new bios mapping into original location */
+ memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
+ *pnr_map = new_nr;
+
+ return 0;
+}
+
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory. If we aren't, we'll fake a memory map.
+ *
+ * We check to see that the memory map contains at least 2 elements
+ * before we'll use it, because the detection code in setup.S may
+ * not be perfect and most every PC known to man has two memory
+ * regions: one from 0 to 640k, and one from 1mb up. (The IBM
+ * thinkpad 560x, for example, does not cooperate with the memory
+ * detection code.)
+ */
+int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
+{
+#ifndef CONFIG_XEN
+ /* Only one memory region (or negative)? Ignore it */
+ if (nr_map < 2)
+ return -1;
+#else
+ BUG_ON(nr_map < 1);
+#endif
+
+ do {
+ u64 start = biosmap->addr;
+ u64 size = biosmap->size;
+ u64 end = start + size;
+ u32 type = biosmap->type;
+
+ /* Overflow in 64 bits? Ignore the memory map. */
+ if (start > end)
+ return -1;
+
+ add_memory_region(start, size, type);
+ } while (biosmap++, --nr_map);
+
+#ifdef CONFIG_XEN
+ if (is_initial_xendomain()) {
+ struct xen_memory_map memmap;
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, machine_e820.map);
+
+ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
+ BUG();
+ machine_e820.nr_map = memmap.nr_entries;
+ } else
+ machine_e820 = e820;
+#endif
+
+ return 0;
+}
+
+/*
+ * Find the highest page frame number we have available
+ */
+void __init propagate_e820_map(void)
+{
+ int i;
+
+ max_pfn = 0;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ unsigned long start, end;
+ /* RAM? */
+ if (e820.map[i].type != E820_RAM)
+ continue;
+ start = PFN_UP(e820.map[i].addr);
+ end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
+ if (start >= end)
+ continue;
+ if (end > max_pfn)
+ max_pfn = end;
+ memory_present(0, start, end);
+ }
+}
+
+/*
+ * Register fully available low RAM pages with the bootmem allocator.
+ */
+void __init register_bootmem_low_pages(unsigned long max_low_pfn)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ unsigned long curr_pfn, last_pfn, size;
+ /*
+ * Reserve usable low memory
+ */
+ if (e820.map[i].type != E820_RAM)
+ continue;
+ /*
+ * We are rounding up the start address of usable memory:
+ */
+ curr_pfn = PFN_UP(e820.map[i].addr);
+ if (curr_pfn >= max_low_pfn)
+ continue;
+ /*
+ * ... and at the end of the usable range downwards:
+ */
+ last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
+
+#ifdef CONFIG_XEN
+ /*
+ * Truncate to the number of actual pages currently
+ * present.
+ */
+ if (last_pfn > xen_start_info->nr_pages)
+ last_pfn = xen_start_info->nr_pages;
+#endif
+
+ if (last_pfn > max_low_pfn)
+ last_pfn = max_low_pfn;
+
+ /*
+ * .. finally, did all the rounding and playing
+ * around just make the area go away?
+ */
+ if (last_pfn <= curr_pfn)
+ continue;
+
+ size = last_pfn - curr_pfn;
+ free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
+ }
+}
+
+void __init e820_register_memory(void)
+{
+ unsigned long gapstart, gapsize, round;
+ unsigned long long last;
+ int i;
+
+#ifdef CONFIG_XEN
+ if (is_initial_xendomain()) {
+ struct xen_memory_map memmap;
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, machine_e820.map);
+
+ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
+ BUG();
+ machine_e820.nr_map = memmap.nr_entries;
+ }
+ else
+ machine_e820 = e820;
+#define e820 machine_e820
+#endif
+
+ /*
+ * Search for the biggest gap in the low 32 bits of the e820
+ * memory space.
+ */
+ last = 0x100000000ull;
+ gapstart = 0x10000000;
+ gapsize = 0x400000;
+ i = e820.nr_map;
+ while (--i >= 0) {
+ unsigned long long start = e820.map[i].addr;
+ unsigned long long end = start + e820.map[i].size;
+
+ /*
+ * Since "last" is at most 4GB, we know we'll
+ * fit in 32 bits if this condition is true
+ */
+ if (last > end) {
+ unsigned long gap = last - end;
+
+ if (gap > gapsize) {
+ gapsize = gap;
+ gapstart = end;
+ }
+ }
+ if (start < last)
+ last = start;
+ }
+#undef e820
+
+ /*
+ * See how much we want to round up: start off with
+ * rounding to the next 1MB area.
+ */
+ round = 0x100000;
+ while ((gapsize >> 4) > round)
+ round += round;
+ /* Fun with two's complement */
+ pci_mem_start = (gapstart + round) & -round;
+
+ printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
+ pci_mem_start, gapstart, gapsize);
+}
+
+void __init print_memory_map(char *who)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ printk(" %s: %016Lx - %016Lx ", who,
+ e820.map[i].addr,
+ e820.map[i].addr + e820.map[i].size);
+ switch (e820.map[i].type) {
+ case E820_RAM: printk("(usable)\n");
+ break;
+ case E820_RESERVED:
+ printk("(reserved)\n");
+ break;
+ case E820_ACPI:
+ printk("(ACPI data)\n");
+ break;
+ case E820_NVS:
+ printk("(ACPI NVS)\n");
+ break;
+ default: printk("type %u\n", e820.map[i].type);
+ break;
+ }
+ }
+}
+
+void __init limit_regions(unsigned long long size)
+{
+ unsigned long long current_addr = 0;
+ int i;
+
+ print_memory_map("limit_regions start");
+ for (i = 0; i < e820.nr_map; i++) {
+ current_addr = e820.map[i].addr + e820.map[i].size;
+ if (current_addr < size)
+ continue;
+
+ if (e820.map[i].type != E820_RAM)
+ continue;
+
+ if (e820.map[i].addr >= size) {
+ /*
+ * This region starts past the end of the
+ * requested size, skip it completely.
+ */
+ e820.nr_map = i;
+ } else {
+ e820.nr_map = i + 1;
+ e820.map[i].size -= current_addr - size;
+ }
+ print_memory_map("limit_regions endfor");
+ return;
+ }
+#ifdef CONFIG_XEN
+ if (current_addr < size) {
+ /*
+ * The e820 map finished before our requested size so
+ * extend the final entry to the requested address.
+ */
+ --i;
+ if (e820.map[i].type == E820_RAM)
+ e820.map[i].size -= current_addr - size;
+ else
+ add_memory_region(current_addr, size - current_addr, E820_RAM);
+ }
+#endif
+ print_memory_map("limit_regions endfunc");
+}
+
+/*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type.
+ */
+int
+e820_any_mapped(u64 start, u64 end, unsigned type)
+{
+ int i;
+
+#ifndef CONFIG_XEN
+ for (i = 0; i < e820.nr_map; i++) {
+ const struct e820entry *ei = &e820.map[i];
+#else
+ if (!is_initial_xendomain())
+ return 0;
+ for (i = 0; i < machine_e820.nr_map; ++i) {
+ const struct e820entry *ei = &machine_e820.map[i];
+#endif
+
+ if (type && ei->type != type)
+ continue;
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(e820_any_mapped);
+
+ /*
+ * This function checks if the entire range <start,end> is mapped with type.
+ *
+ * Note: this function only works correct if the e820 table is sorted and
+ * not-overlapping, which is the case
+ */
+int __init
+e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
+{
+ u64 start = s;
+ u64 end = e;
+ int i;
+
+#ifndef CONFIG_XEN
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+#else
+ if (!is_initial_xendomain())
+ return 0;
+ for (i = 0; i < machine_e820.nr_map; ++i) {
+ const struct e820entry *ei = &machine_e820.map[i];
+#endif
+
+ if (type && ei->type != type)
+ continue;
+ /* is the region (part) in overlap with the current region ?*/
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+ /* if the region is at the beginning of <start,end> we move
+ * start to the end of the region since it's ok until there
+ */
+ if (ei->addr <= start)
+ start = ei->addr + ei->size;
+ /* if start is now at or beyond end, we're done, full
+ * coverage */
+ if (start >= end)
+ return 1; /* we're done */
+ }
+ return 0;
+}
+
+static int __init parse_memmap(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp(arg, "exactmap") == 0) {
+#ifdef CONFIG_CRASH_DUMP
+ /* If we are doing a crash dump, we
+ * still need to know the real mem
+ * size before original memory map is
+ * reset.
+ */
+ propagate_e820_map();
+ saved_max_pfn = max_pfn;
+#endif
+ e820.nr_map = 0;
+ user_defined_memmap = 1;
+ } else {
+ /* If the user specifies memory size, we
+ * limit the BIOS-provided memory map to
+ * that size. exactmap can be used to specify
+ * the exact map. mem=number can be used to
+ * trim the existing memory map.
+ */
+ unsigned long long start_at, mem_size;
+
+ mem_size = memparse(arg, &arg);
+ if (*arg == '@') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_RAM);
+ } else if (*arg == '#') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_ACPI);
+ } else if (*arg == '$') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_RESERVED);
+ } else {
+ limit_regions(mem_size);
+ user_defined_memmap = 1;
+ }
+ }
+ return 0;
+}
+early_param("memmap", parse_memmap);
+
+#ifndef CONFIG_XEN
+void __init update_memory_range(u64 start, u64 size, unsigned old_type,
+ unsigned new_type)
+{
+ int i;
+
+ BUG_ON(old_type == new_type);
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 final_start, final_end;
+ if (ei->type != old_type)
+ continue;
+ /* totally covered? */
+ if (ei->addr >= start && ei->size <= size) {
+ ei->type = new_type;
+ continue;
+ }
+ /* partially covered */
+ final_start = max(start, ei->addr);
+ final_end = min(start + size, ei->addr + ei->size);
+ if (final_start >= final_end)
+ continue;
+ add_memory_region(final_start, final_end - final_start,
+ new_type);
+ }
+}
+
+void __init update_e820(void)
+{
+ u8 nr_map;
+
+ nr_map = e820.nr_map;
+ if (sanitize_e820_map(e820.map, &nr_map))
+ return;
+ e820.nr_map = nr_map;
+ printk(KERN_INFO "modified physical RAM map:\n");
+ print_memory_map("modified");
+}
+#endif
--- /dev/null
+/*
+ * Handle the memory map.
+ * The functions here do the job until bootmem takes over.
+ *
+ * Getting sanitize_e820_map() in sync with i386 version by applying change:
+ * - Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach <xela@slit.de>, December 2002.
+ * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/suspend.h>
+#include <linux/pfn.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <xen/interface/memory.h>
+
+struct e820map e820 __initdata;
+#ifdef CONFIG_XEN
+struct e820map machine_e820;
+#endif
+
+/*
+ * PFN of last memory page.
+ */
+unsigned long end_pfn;
+
+/*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long max_pfn_mapped;
+
+/*
+ * Last pfn which the user wants to use.
+ */
+static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
+
+/*
+ * Early reserved memory areas.
+ */
+#define MAX_EARLY_RES 20
+
+struct early_res {
+ unsigned long start, end;
+ char name[16];
+};
+static struct early_res early_res[MAX_EARLY_RES] __initdata = {
+#ifndef CONFIG_XEN
+ { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
+#ifdef CONFIG_X86_TRAMPOLINE
+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
+#endif
+#endif
+ {}
+};
+
+void __init reserve_early(unsigned long start, unsigned long end, char *name)
+{
+ int i;
+ struct early_res *r;
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ r = &early_res[i];
+ if (end > r->start && start < r->end)
+ panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
+ start, end - 1, name?name:"", r->start, r->end - 1, r->name);
+ }
+ if (i >= MAX_EARLY_RES)
+ panic("Too many early reservations");
+ r = &early_res[i];
+ r->start = start;
+ r->end = end;
+ if (name)
+ strncpy(r->name, name, sizeof(r->name) - 1);
+}
+
+void __init free_early(unsigned long start, unsigned long end)
+{
+ struct early_res *r;
+ int i, j;
+
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ r = &early_res[i];
+ if (start == r->start && end == r->end)
+ break;
+ }
+ if (i >= MAX_EARLY_RES || !early_res[i].end)
+ panic("free_early on not reserved area: %lx-%lx!", start, end);
+
+ for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
+ ;
+
+ memmove(&early_res[i], &early_res[i + 1],
+ (j - 1 - i) * sizeof(struct early_res));
+
+ early_res[j - 1].end = 0;
+}
+
+void __init early_res_to_bootmem(unsigned long start, unsigned long end)
+{
+ int i;
+ unsigned long final_start, final_end;
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ struct early_res *r = &early_res[i];
+ final_start = max(start, r->start);
+ final_end = min(end, r->end);
+ if (final_start >= final_end)
+ continue;
+ printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
+ final_start, final_end - 1, r->name);
+ reserve_bootmem_generic(final_start, final_end - final_start);
+ }
+}
+
+/* Check for already reserved areas */
+static inline int __init
+bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
+{
+ int i;
+ unsigned long addr = *addrp, last;
+ int changed = 0;
+again:
+ last = addr + size;
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ struct early_res *r = &early_res[i];
+ if (last >= r->start && addr < r->end) {
+ *addrp = addr = round_up(r->end, align);
+ changed = 1;
+ goto again;
+ }
+ }
+ return changed;
+}
+
+/* Check for already reserved areas */
+static inline int __init
+bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
+{
+ int i;
+ unsigned long addr = *addrp, last;
+ unsigned long size = *sizep;
+ int changed = 0;
+again:
+ last = addr + size;
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ struct early_res *r = &early_res[i];
+ if (last > r->start && addr < r->start) {
+ size = r->start - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last > r->end && addr < r->end) {
+ addr = round_up(r->end, align);
+ size = last - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last <= r->end && addr >= r->start) {
+ (*sizep)++;
+ return 0;
+ }
+ }
+ if (changed) {
+ *addrp = addr;
+ *sizep = size;
+ }
+ return changed;
+}
+/*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type.
+ */
+int
+e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
+{
+ int i;
+
+#ifndef CONFIG_XEN
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+#else
+ if (!is_initial_xendomain())
+ return 0;
+ for (i = 0; i < machine_e820.nr_map; i++) {
+ const struct e820entry *ei = &machine_e820.map[i];
+#endif
+
+ if (type && ei->type != type)
+ continue;
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(e820_any_mapped);
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ *
+ * Note: this function only works correct if the e820 table is sorted and
+ * not-overlapping, which is the case
+ */
+int __init e820_all_mapped(unsigned long start, unsigned long end,
+ unsigned type)
+{
+ int i;
+
+#ifndef CONFIG_XEN
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+#else
+ if (!is_initial_xendomain())
+ return 0;
+ for (i = 0; i < machine_e820.nr_map; i++) {
+ const struct e820entry *ei = &machine_e820.map[i];
+#endif
+
+ if (type && ei->type != type)
+ continue;
+ /* is the region (part) in overlap with the current region ?*/
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+
+ /* if the region is at the beginning of <start,end> we move
+ * start to the end of the region since it's ok until there
+ */
+ if (ei->addr <= start)
+ start = ei->addr + ei->size;
+ /*
+ * if start is now at or beyond end, we're done, full
+ * coverage
+ */
+ if (start >= end)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Find a free area with specified alignment in a specific range.
+ */
+unsigned long __init find_e820_area(unsigned long start, unsigned long end,
+ unsigned long size, unsigned long align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ unsigned long addr, last;
+ unsigned long ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ continue;
+ while (bad_addr(&addr, size, align) && addr+size <= ei_last)
+ ;
+ last = addr + size;
+ if (last > ei_last)
+ continue;
+ if (last > end)
+ continue;
+ return addr;
+ }
+ return -1UL;
+}
+
+/*
+ * Find next free range after *start
+ */
+unsigned long __init find_e820_area_size(unsigned long start,
+ unsigned long *sizep,
+ unsigned long align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ unsigned long addr, last;
+ unsigned long ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ continue;
+ *sizep = ei_last - addr;
+ while (bad_addr_size(&addr, sizep, align) &&
+ addr + *sizep <= ei_last)
+ ;
+ last = addr + *sizep;
+ if (last > ei_last)
+ continue;
+ return addr;
+ }
+ return -1UL;
+
+}
+/*
+ * Find the highest page frame number we have available
+ */
+unsigned long __init e820_end_of_ram(void)
+{
+ unsigned long end_pfn;
+
+ end_pfn = find_max_pfn_with_active_regions();
+
+ if (end_pfn > max_pfn_mapped)
+ max_pfn_mapped = end_pfn;
+ if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
+ max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
+ if (end_pfn > end_user_pfn)
+ end_pfn = end_user_pfn;
+ if (end_pfn > max_pfn_mapped)
+ end_pfn = max_pfn_mapped;
+
+ printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
+ return end_pfn;
+}
+
+/*
+ * Mark e820 reserved areas as busy for the resource manager.
+ */
+void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
+{
+ int i;
+ struct resource *res;
+
+ res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
+ for (i = 0; i < nr_map; i++) {
+ switch (e820[i].type) {
+ case E820_RAM: res->name = "System RAM"; break;
+ case E820_ACPI: res->name = "ACPI Tables"; break;
+ case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
+ default: res->name = "reserved";
+ }
+ res->start = e820[i].addr;
+ res->end = res->start + e820[i].size - 1;
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ insert_resource(&iomem_resource, res);
+ res++;
+ }
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Find the ranges of physical addresses that do not correspond to
+ * e820 RAM areas and mark the corresponding pages as nosave for software
+ * suspend and suspend to RAM.
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(void)
+{
+ int i;
+ unsigned long paddr;
+
+ paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
+ for (i = 1; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (paddr < ei->addr)
+ register_nosave_region(PFN_DOWN(paddr),
+ PFN_UP(ei->addr));
+
+ paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
+ if (ei->type != E820_RAM)
+ register_nosave_region(PFN_UP(ei->addr),
+ PFN_DOWN(paddr));
+
+ if (paddr >= (end_pfn << PAGE_SHIFT))
+ break;
+ }
+}
+#endif
+
+/*
+ * Finds an active region in the address range from start_pfn to end_pfn and
+ * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
+ */
+static int __init e820_find_active_region(const struct e820entry *ei,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long *ei_startpfn,
+ unsigned long *ei_endpfn)
+{
+ *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
+ *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* Skip map entries smaller than a page */
+ if (*ei_startpfn >= *ei_endpfn)
+ return 0;
+
+ /* Check if max_pfn_mapped should be updated */
+ if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
+ max_pfn_mapped = *ei_endpfn;
+
+ /* Skip if map is outside the node */
+ if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
+ *ei_startpfn >= end_pfn)
+ return 0;
+
+ /* Check for overlaps */
+ if (*ei_startpfn < start_pfn)
+ *ei_startpfn = start_pfn;
+ if (*ei_endpfn > end_pfn)
+ *ei_endpfn = end_pfn;
+
+ /* Obey end_user_pfn to save on memmap */
+ if (*ei_startpfn >= end_user_pfn)
+ return 0;
+ if (*ei_endpfn > end_user_pfn)
+ *ei_endpfn = end_user_pfn;
+
+ return 1;
+}
+
+/* Walk the e820 map and register active regions within a node */
+void __init
+e820_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long ei_startpfn;
+ unsigned long ei_endpfn;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++)
+ if (e820_find_active_region(&e820.map[i],
+ start_pfn, end_pfn,
+ &ei_startpfn, &ei_endpfn))
+ add_active_range(nid, ei_startpfn, ei_endpfn);
+}
+
+/*
+ * Add a memory region to the kernel e820 map.
+ */
+void __init add_memory_region(unsigned long start, unsigned long size, int type)
+{
+ int x = e820.nr_map;
+
+ if (x == E820MAX) {
+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+ return;
+ }
+
+ e820.map[x].addr = start;
+ e820.map[x].size = size;
+ e820.map[x].type = type;
+ e820.nr_map++;
+}
+
+/*
+ * Find the hole size (in bytes) in the memory range.
+ * @start: starting address of the memory range to scan
+ * @end: ending address of the memory range to scan
+ */
+unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long end_pfn = end >> PAGE_SHIFT;
+ unsigned long ei_startpfn, ei_endpfn, ram = 0;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ if (e820_find_active_region(&e820.map[i],
+ start_pfn, end_pfn,
+ &ei_startpfn, &ei_endpfn))
+ ram += ei_endpfn - ei_startpfn;
+ }
+ return end - start - (ram << PAGE_SHIFT);
+}
+
+static void __init e820_print_map(char *who)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+ (unsigned long long) e820.map[i].addr,
+ (unsigned long long)
+ (e820.map[i].addr + e820.map[i].size));
+ switch (e820.map[i].type) {
+ case E820_RAM:
+ printk(KERN_CONT "(usable)\n");
+ break;
+ case E820_RESERVED:
+ printk(KERN_CONT "(reserved)\n");
+ break;
+ case E820_ACPI:
+ printk(KERN_CONT "(ACPI data)\n");
+ break;
+ case E820_NVS:
+ printk(KERN_CONT "(ACPI NVS)\n");
+ break;
+ default:
+ printk(KERN_CONT "type %u\n", e820.map[i].type);
+ break;
+ }
+ }
+}
+
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries. The following
+ * replaces the original e820 map with a new one, removing overlaps.
+ *
+ */
+static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
+{
+ struct change_member {
+ struct e820entry *pbios; /* pointer to original bios entry */
+ unsigned long long addr; /* address for this change point */
+ };
+ static struct change_member change_point_list[2*E820MAX] __initdata;
+ static struct change_member *change_point[2*E820MAX] __initdata;
+ static struct e820entry *overlap_list[E820MAX] __initdata;
+ static struct e820entry new_bios[E820MAX] __initdata;
+ struct change_member *change_tmp;
+ unsigned long current_type, last_type;
+ unsigned long long last_addr;
+ int chgidx, still_changing;
+ int overlap_entries;
+ int new_bios_entry;
+ int old_nr, new_nr, chg_nr;
+ int i;
+
+ /*
+ Visually we're performing the following
+ (1,2,3,4 = memory types)...
+
+ Sample memory map (w/overlaps):
+ ____22__________________
+ ______________________4_
+ ____1111________________
+ _44_____________________
+ 11111111________________
+ ____________________33__
+ ___________44___________
+ __________33333_________
+ ______________22________
+ ___________________2222_
+ _________111111111______
+ _____________________11_
+ _________________4______
+
+ Sanitized equivalent (no overlap):
+ 1_______________________
+ _44_____________________
+ ___1____________________
+ ____22__________________
+ ______11________________
+ _________1______________
+ __________3_____________
+ ___________44___________
+ _____________33_________
+ _______________2________
+ ________________1_______
+ _________________4______
+ ___________________2____
+ ____________________33__
+ ______________________4_
+ */
+
+ /* if there's only one memory region, don't bother */
+ if (*pnr_map < 2)
+ return -1;
+
+ old_nr = *pnr_map;
+
+ /* bail out if we find any unreasonable addresses in bios map */
+ for (i = 0; i < old_nr; i++)
+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+ return -1;
+
+ /* create pointers for initial change-point information (for sorting) */
+ for (i = 0; i < 2 * old_nr; i++)
+ change_point[i] = &change_point_list[i];
+
+ /* record all known change-points (starting and ending addresses),
+ omitting those that are for empty memory regions */
+ chgidx = 0;
+ for (i = 0; i < old_nr; i++) {
+ if (biosmap[i].size != 0) {
+ change_point[chgidx]->addr = biosmap[i].addr;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ change_point[chgidx]->addr = biosmap[i].addr +
+ biosmap[i].size;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ }
+ }
+ chg_nr = chgidx;
+
+ /* sort change-point list by memory addresses (low -> high) */
+ still_changing = 1;
+ while (still_changing) {
+ still_changing = 0;
+ for (i = 1; i < chg_nr; i++) {
+ unsigned long long curaddr, lastaddr;
+ unsigned long long curpbaddr, lastpbaddr;
+
+ curaddr = change_point[i]->addr;
+ lastaddr = change_point[i - 1]->addr;
+ curpbaddr = change_point[i]->pbios->addr;
+ lastpbaddr = change_point[i - 1]->pbios->addr;
+
+ /*
+ * swap entries, when:
+ *
+ * curaddr > lastaddr or
+ * curaddr == lastaddr and curaddr == curpbaddr and
+ * lastaddr != lastpbaddr
+ */
+ if (curaddr < lastaddr ||
+ (curaddr == lastaddr && curaddr == curpbaddr &&
+ lastaddr != lastpbaddr)) {
+ change_tmp = change_point[i];
+ change_point[i] = change_point[i-1];
+ change_point[i-1] = change_tmp;
+ still_changing = 1;
+ }
+ }
+ }
+
+ /* create a new bios memory map, removing overlaps */
+ overlap_entries = 0; /* number of entries in the overlap table */
+ new_bios_entry = 0; /* index for creating new bios map entries */
+ last_type = 0; /* start with undefined memory type */
+ last_addr = 0; /* start with 0 as last starting address */
+
+ /* loop through change-points, determining affect on the new bios map */
+ for (chgidx = 0; chgidx < chg_nr; chgidx++) {
+ /* keep track of all overlapping bios entries */
+ if (change_point[chgidx]->addr ==
+ change_point[chgidx]->pbios->addr) {
+ /*
+ * add map entry to overlap list (> 1 entry
+ * implies an overlap)
+ */
+ overlap_list[overlap_entries++] =
+ change_point[chgidx]->pbios;
+ } else {
+ /*
+ * remove entry from list (order independent,
+ * so swap with last)
+ */
+ for (i = 0; i < overlap_entries; i++) {
+ if (overlap_list[i] ==
+ change_point[chgidx]->pbios)
+ overlap_list[i] =
+ overlap_list[overlap_entries-1];
+ }
+ overlap_entries--;
+ }
+ /*
+ * if there are overlapping entries, decide which
+ * "type" to use (larger value takes precedence --
+ * 1=usable, 2,3,4,4+=unusable)
+ */
+ current_type = 0;
+ for (i = 0; i < overlap_entries; i++)
+ if (overlap_list[i]->type > current_type)
+ current_type = overlap_list[i]->type;
+ /*
+ * continue building up new bios map based on this
+ * information
+ */
+ if (current_type != last_type) {
+ if (last_type != 0) {
+ new_bios[new_bios_entry].size =
+ change_point[chgidx]->addr - last_addr;
+ /*
+ * move forward only if the new size
+ * was non-zero
+ */
+ if (new_bios[new_bios_entry].size != 0)
+ /*
+ * no more space left for new
+ * bios entries ?
+ */
+ if (++new_bios_entry >= E820MAX)
+ break;
+ }
+ if (current_type != 0) {
+ new_bios[new_bios_entry].addr =
+ change_point[chgidx]->addr;
+ new_bios[new_bios_entry].type = current_type;
+ last_addr = change_point[chgidx]->addr;
+ }
+ last_type = current_type;
+ }
+ }
+ /* retain count for new bios entries */
+ new_nr = new_bios_entry;
+
+ /* copy new bios mapping into original location */
+ memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
+ *pnr_map = new_nr;
+
+ return 0;
+}
+
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory. If we aren't, we'll fake a memory map.
+ */
+static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
+{
+#ifndef CONFIG_XEN
+ /* Only one memory region (or negative)? Ignore it */
+ if (nr_map < 2)
+ return -1;
+#else
+ BUG_ON(nr_map < 1);
+#endif
+
+ do {
+ u64 start = biosmap->addr;
+ u64 size = biosmap->size;
+ u64 end = start + size;
+ u32 type = biosmap->type;
+
+ /* Overflow in 64 bits? Ignore the memory map. */
+ if (start > end)
+ return -1;
+
+ add_memory_region(start, size, type);
+ } while (biosmap++, --nr_map);
+
+#ifdef CONFIG_XEN
+ if (is_initial_xendomain()) {
+ struct xen_memory_map memmap;
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, machine_e820.map);
+
+ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
+ BUG();
+ machine_e820.nr_map = memmap.nr_entries;
+ } else
+ machine_e820 = e820;
+#endif
+
+ return 0;
+}
+
+static void early_panic(char *msg)
+{
+ early_printk(msg);
+ panic(msg);
+}
+
+/* We're not void only for x86 32-bit compat */
+char * __init machine_specific_memory_setup(void)
+{
+#ifndef CONFIG_XEN
+ char *who = "BIOS-e820";
+ /*
+ * Try to copy the BIOS-supplied E820-map.
+ *
+ * Otherwise fake a memory map; one section from 0k->640k,
+ * the next section from 1mb->appropriate_mem_k
+ */
+ sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
+ if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
+ early_panic("Cannot find a valid memory map");
+#else /* CONFIG_XEN */
+ char *who = "Xen";
+ int rc;
+ struct xen_memory_map memmap;
+ /*
+ * This is rather large for a stack variable but this early in
+ * the boot process we know we have plenty slack space.
+ */
+ struct e820entry map[E820MAX];
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, map);
+
+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+ if ( rc == -ENOSYS ) {
+ memmap.nr_entries = 1;
+ map[0].addr = 0ULL;
+ map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
+ /* 8MB slack (to balance backend allocations). */
+ map[0].size += 8 << 20;
+ map[0].type = E820_RAM;
+ rc = 0;
+ }
+ BUG_ON(rc);
+
+ sanitize_e820_map(map, (char *)&memmap.nr_entries);
+
+ if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
+ early_panic("Cannot find a valid memory map");
+#endif
+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+ e820_print_map(who);
+
+ /* In case someone cares... */
+ return who;
+}
+
+static int __init parse_memopt(char *p)
+{
+ int i;
+ unsigned long current_end;
+ unsigned long end;
+
+ if (!p)
+ return -EINVAL;
+ end_user_pfn = memparse(p, &p);
+ end_user_pfn >>= PAGE_SHIFT;
+
+ end = end_user_pfn<<PAGE_SHIFT;
+ i = e820.nr_map-1;
+ current_end = e820.map[i].addr + e820.map[i].size;
+
+ if (current_end < end) {
+ /*
+ * The e820 map ends before our requested size so
+ * extend the final entry to the requested address.
+ */
+ if (e820.map[i].type == E820_RAM)
+ e820.map[i].size = end - e820.map[i].addr;
+ else
+ add_memory_region(current_end, end - current_end, E820_RAM);
+ }
+
+ return 0;
+}
+early_param("mem", parse_memopt);
+
+static int userdef __initdata;
+
+static int __init parse_memmap_opt(char *p)
+{
+ char *oldp;
+ unsigned long long start_at, mem_size;
+
+ if (!strcmp(p, "exactmap")) {
+#ifdef CONFIG_CRASH_DUMP
+ /*
+ * If we are doing a crash dump, we still need to know
+ * the real mem size before original memory map is
+ * reset.
+ */
+ e820_register_active_regions(0, 0, -1UL);
+ saved_max_pfn = e820_end_of_ram();
+ remove_all_active_ranges();
+#endif
+ max_pfn_mapped = 0;
+ e820.nr_map = 0;
+ userdef = 1;
+ return 0;
+ }
+
+ oldp = p;
+ mem_size = memparse(p, &p);
+ if (p == oldp)
+ return -EINVAL;
+
+ userdef = 1;
+ if (*p == '@') {
+ start_at = memparse(p+1, &p);
+ add_memory_region(start_at, mem_size, E820_RAM);
+ } else if (*p == '#') {
+ start_at = memparse(p+1, &p);
+ add_memory_region(start_at, mem_size, E820_ACPI);
+ } else if (*p == '$') {
+ start_at = memparse(p+1, &p);
+ add_memory_region(start_at, mem_size, E820_RESERVED);
+ } else {
+ end_user_pfn = (mem_size >> PAGE_SHIFT);
+ }
+ return *p == '\0' ? 0 : -EINVAL;
+}
+early_param("memmap", parse_memmap_opt);
+
+void __init finish_e820_parsing(void)
+{
+ if (userdef) {
+ char nr = e820.nr_map;
+
+ if (sanitize_e820_map(e820.map, &nr) < 0)
+ early_panic("Invalid user supplied memory map");
+ e820.nr_map = nr;
+
+ printk(KERN_INFO "user-defined physical RAM map:\n");
+ e820_print_map("user");
+ }
+}
+
+#ifndef CONFIG_XEN
+void __init update_memory_range(u64 start, u64 size, unsigned old_type,
+ unsigned new_type)
+{
+ int i;
+
+ BUG_ON(old_type == new_type);
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 final_start, final_end;
+ if (ei->type != old_type)
+ continue;
+ /* totally covered? */
+ if (ei->addr >= start && ei->size <= size) {
+ ei->type = new_type;
+ continue;
+ }
+ /* partially covered */
+ final_start = max(start, ei->addr);
+ final_end = min(start + size, ei->addr + ei->size);
+ if (final_start >= final_end)
+ continue;
+ add_memory_region(final_start, final_end - final_start,
+ new_type);
+ }
+}
+
+void __init update_e820(void)
+{
+ u8 nr_map;
+
+ nr_map = e820.nr_map;
+ if (sanitize_e820_map(e820.map, &nr_map))
+ return;
+ e820.nr_map = nr_map;
+ printk(KERN_INFO "modified physical RAM map:\n");
+ e820_print_map("modified");
+}
+#endif
+
+unsigned long pci_mem_start = 0xaeedbabe;
+EXPORT_SYMBOL(pci_mem_start);
+
+/*
+ * Search for the biggest gap in the low 32 bits of the e820
+ * memory space. We pass this space to PCI to assign MMIO resources
+ * for hotplug or unconfigured devices in.
+ * Hopefully the BIOS let enough space left.
+ */
+__init void e820_setup_gap(struct e820entry *e820, int nr_map)
+{
+ unsigned long gapstart, gapsize, round;
+ unsigned long last;
+ int i;
+ int found = 0;
+
+ last = 0x100000000ull;
+ gapstart = 0x10000000;
+ gapsize = 0x400000;
+ i = nr_map;
+ while (--i >= 0) {
+ unsigned long long start = e820[i].addr;
+ unsigned long long end = start + e820[i].size;
+
+ /*
+ * Since "last" is at most 4GB, we know we'll
+ * fit in 32 bits if this condition is true
+ */
+ if (last > end) {
+ unsigned long gap = last - end;
+
+ if (gap > gapsize) {
+ gapsize = gap;
+ gapstart = end;
+ found = 1;
+ }
+ }
+ if (start < last)
+ last = start;
+ }
+
+ if (!found) {
+ gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
+ printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
+ "address range\n"
+ KERN_ERR "PCI: Unassigned devices with 32bit resource "
+ "registers may break!\n");
+ }
+
+ /*
+ * See how much we want to round up: start off with
+ * rounding to the next 1MB area.
+ */
+ round = 0x100000;
+ while ((gapsize >> 4) > round)
+ round += round;
+ /* Fun with two's complement */
+ pci_mem_start = (gapstart + round) & -round;
+
+ printk(KERN_INFO
+ "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
+ pci_mem_start, gapstart, gapsize);
+}
+
+int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
+{
+ int i;
+
+ if (slot < 0 || slot >= e820.nr_map)
+ return -1;
+ for (i = slot; i < e820.nr_map; i++) {
+ if (e820.map[i].type != E820_RAM)
+ continue;
+ break;
+ }
+ if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
+ return -1;
+ *addr = e820.map[i].addr;
+ *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
+ max_pfn << PAGE_SHIFT) - *addr;
+ return i + 1;
+}
--- /dev/null
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/screen_info.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/fcntl.h>
+#include <asm/setup.h>
+
+/* Simple VGA output */
+#define VGABASE (__ISA_IO_base + 0xb8000)
+
+#ifndef CONFIG_XEN
+static int max_ypos = 25, max_xpos = 80;
+static int current_ypos = 25, current_xpos;
+
+static void early_vga_write(struct console *con, const char *str, unsigned n)
+{
+ char c;
+ int i, k, j;
+
+ while ((c = *str++) != '\0' && n-- > 0) {
+ if (current_ypos >= max_ypos) {
+ /* scroll 1 line up */
+ for (k = 1, j = 0; k < max_ypos; k++, j++) {
+ for (i = 0; i < max_xpos; i++) {
+ writew(readw(VGABASE+2*(max_xpos*k+i)),
+ VGABASE + 2*(max_xpos*j + i));
+ }
+ }
+ for (i = 0; i < max_xpos; i++)
+ writew(0x720, VGABASE + 2*(max_xpos*j + i));
+ current_ypos = max_ypos-1;
+ }
+ if (c == '\n') {
+ current_xpos = 0;
+ current_ypos++;
+ } else if (c != '\r') {
+ writew(((0x7 << 8) | (unsigned short) c),
+ VGABASE + 2*(max_xpos*current_ypos +
+ current_xpos++));
+ if (current_xpos >= max_xpos) {
+ current_xpos = 0;
+ current_ypos++;
+ }
+ }
+ }
+}
+
+static struct console early_vga_console = {
+ .name = "earlyvga",
+ .write = early_vga_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
+
+/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
+
+static int early_serial_base = 0x3f8; /* ttyS0 */
+
+#define XMTRDY 0x20
+
+#define DLAB 0x80
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define RXR 0 /* Receive register (READ) */
+#define IER 1 /* Interrupt Enable */
+#define IIR 2 /* Interrupt ID */
+#define FCR 2 /* FIFO control */
+#define LCR 3 /* Line control */
+#define MCR 4 /* Modem control */
+#define LSR 5 /* Line Status */
+#define MSR 6 /* Modem Status */
+#define DLL 0 /* Divisor Latch Low */
+#define DLH 1 /* Divisor latch High */
+
+static int early_serial_putc(unsigned char ch)
+{
+ unsigned timeout = 0xffff;
+ while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+ cpu_relax();
+ outb(ch, early_serial_base + TXR);
+ return timeout ? 0 : -1;
+}
+
+static void early_serial_write(struct console *con, const char *s, unsigned n)
+{
+ while (*s && n-- > 0) {
+ if (*s == '\n')
+ early_serial_putc('\r');
+ early_serial_putc(*s);
+ s++;
+ }
+}
+
+#define DEFAULT_BAUD 9600
+
+static __init void early_serial_init(char *s)
+{
+ unsigned char c;
+ unsigned divisor;
+ unsigned baud = DEFAULT_BAUD;
+ char *e;
+
+ if (*s == ',')
+ ++s;
+
+ if (*s) {
+ unsigned port;
+ if (!strncmp(s, "0x", 2)) {
+ early_serial_base = simple_strtoul(s, &e, 16);
+ } else {
+ static int bases[] = { 0x3f8, 0x2f8 };
+
+ if (!strncmp(s, "ttyS", 4))
+ s += 4;
+ port = simple_strtoul(s, &e, 10);
+ if (port > 1 || s == e)
+ port = 0;
+ early_serial_base = bases[port];
+ }
+ s += strcspn(s, ",");
+ if (*s == ',')
+ s++;
+ }
+
+ outb(0x3, early_serial_base + LCR); /* 8n1 */
+ outb(0, early_serial_base + IER); /* no interrupt */
+ outb(0, early_serial_base + FCR); /* no fifo */
+ outb(0x3, early_serial_base + MCR); /* DTR + RTS */
+
+ if (*s) {
+ baud = simple_strtoul(s, &e, 0);
+ if (baud == 0 || s == e)
+ baud = DEFAULT_BAUD;
+ }
+
+ divisor = 115200 / baud;
+ c = inb(early_serial_base + LCR);
+ outb(c | DLAB, early_serial_base + LCR);
+ outb(divisor & 0xff, early_serial_base + DLL);
+ outb((divisor >> 8) & 0xff, early_serial_base + DLH);
+ outb(c & ~DLAB, early_serial_base + LCR);
+}
+
+#else /* CONFIG_XEN */
+
+static void
+early_serial_write(struct console *con, const char *s, unsigned count)
+{
+ int n;
+
+ while (count > 0) {
+ n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s);
+ if (n <= 0)
+ break;
+ count -= n;
+ s += n;
+ }
+}
+
+static __init void early_serial_init(char *s)
+{
+}
+
+/*
+ * No early VGA console on Xen, as we do not have convenient ISA-space
+ * mappings. Someone should fix this for domain 0. For now, use fake serial.
+ */
+#define early_vga_console early_serial_console
+#define xenboot_console early_serial_console
+
+#endif
+
+static struct console early_serial_console = {
+ .name = "earlyser",
+ .write = early_serial_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
+
+/* Console interface to a host file on AMD's SimNow! */
+
+static int simnow_fd;
+
+enum {
+ MAGIC1 = 0xBACCD00A,
+ MAGIC2 = 0xCA110000,
+ XOPEN = 5,
+ XWRITE = 4,
+};
+
+static noinline long simnow(long cmd, long a, long b, long c)
+{
+ long ret;
+ asm volatile("cpuid" :
+ "=a" (ret) :
+ "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
+ return ret;
+}
+
+static void __init simnow_init(char *str)
+{
+ char *fn = "klog";
+ if (*str == '=')
+ fn = ++str;
+ /* error ignored */
+ simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
+}
+
+static void simnow_write(struct console *con, const char *s, unsigned n)
+{
+ simnow(XWRITE, simnow_fd, (unsigned long)s, n);
+}
+
+static struct console simnow_console = {
+ .name = "simnow",
+ .write = simnow_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
+
+/* Direct interface for emergencies */
+static struct console *early_console = &early_vga_console;
+static int early_console_initialized;
+
+void early_printk(const char *fmt, ...)
+{
+ char buf[512];
+ int n;
+ va_list ap;
+
+ va_start(ap, fmt);
+ n = vscnprintf(buf, 512, fmt, ap);
+ early_console->write(early_console, buf, n);
+ va_end(ap);
+}
+
+static int __initdata keep_early;
+
+static int __init setup_early_printk(char *buf)
+{
+ if (!buf)
+ return 0;
+
+ if (early_console_initialized)
+ return 0;
+ early_console_initialized = 1;
+
+ if (strstr(buf, "keep"))
+ keep_early = 1;
+
+ if (!strncmp(buf, "serial", 6)) {
+ early_serial_init(buf + 6);
+ early_console = &early_serial_console;
+ } else if (!strncmp(buf, "ttyS", 4)) {
+ early_serial_init(buf);
+ early_console = &early_serial_console;
+ } else if (!strncmp(buf, "vga", 3)) {
+#ifndef CONFIG_XEN
+ && boot_params.screen_info.orig_video_isVGA == 1) {
+ max_xpos = boot_params.screen_info.orig_video_cols;
+ max_ypos = boot_params.screen_info.orig_video_lines;
+ current_ypos = boot_params.screen_info.orig_y;
+#endif
+ early_console = &early_vga_console;
+ } else if (!strncmp(buf, "simnow", 6)) {
+ simnow_init(buf + 6);
+ early_console = &simnow_console;
+ keep_early = 1;
+#ifdef CONFIG_XEN
+ } else if (!strncmp(buf, "xen", 3)) {
+ early_console = &xenboot_console;
+#endif
+ }
+
+ if (keep_early)
+ early_console->flags &= ~CON_BOOT;
+ else
+ early_console->flags |= CON_BOOT;
+ register_console(early_console);
+ return 0;
+}
+early_param("earlyprintk", setup_early_printk);
--- /dev/null
+/*
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+/*
+ * entry.S contains the system-call and fault low-level handling routines.
+ * This also contains the timer-interrupt handler, as well as all interrupts
+ * and faults that can result in a task-switch.
+ *
+ * NOTE: This code handles signal-recognition, which happens every time
+ * after a timer-interrupt and after each system call.
+ *
+ * I changed all the .align's to 4 (16 byte alignment), as that's faster
+ * on a 486.
+ *
+ * Stack layout in 'syscall_exit':
+ * ptrace needs to have all regs on the stack.
+ * if the order here is changed, it needs to be
+ * updated in fork.c:copy_process, signal.c:do_signal,
+ * ptrace.c and ptrace.h
+ *
+ * 0(%esp) - %ebx
+ * 4(%esp) - %ecx
+ * 8(%esp) - %edx
+ * C(%esp) - %esi
+ * 10(%esp) - %edi
+ * 14(%esp) - %ebp
+ * 18(%esp) - %eax
+ * 1C(%esp) - %ds
+ * 20(%esp) - %es
+ * 24(%esp) - %fs
+ * 28(%esp) - orig_eax
+ * 2C(%esp) - %eip
+ * 30(%esp) - %cs
+ * 34(%esp) - %eflags
+ * 38(%esp) - %oldesp
+ * 3C(%esp) - %oldss
+ *
+ * "current" is in register %ebx during any slow entries.
+ */
+
+#include <linux/linkage.h>
+#include <asm/thread_info.h>
+#include <asm/irqflags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+#include <asm/smp.h>
+#include <asm/page.h>
+#include <asm/desc.h>
+#include <asm/percpu.h>
+#include <asm/dwarf2.h>
+#include <asm/processor-flags.h>
+#include "irq_vectors.h"
+#include <xen/interface/xen.h>
+
+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization. The following will never clobber any registers:
+ * INTERRUPT_RETURN (aka. "iret")
+ * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */
+
+#define nr_syscalls ((syscall_table_size)/4)
+
+/* Pseudo-eflags. */
+NMI_MASK = 0x80000000
+
+#ifdef CONFIG_PREEMPT
+#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
+#else
+#define preempt_stop(clobbers)
+#define resume_kernel restore_nocheck
+#endif
+
+.macro TRACE_IRQS_IRET
+#ifdef CONFIG_TRACE_IRQFLAGS
+ testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
+ jz 1f
+ TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+#ifdef CONFIG_VM86
+#define resume_userspace_sig check_userspace
+#else
+#define resume_userspace_sig resume_userspace
+#endif
+
+#define SAVE_ALL \
+ cld; \
+ pushl %fs; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ /*CFI_REL_OFFSET fs, 0;*/\
+ pushl %es; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ /*CFI_REL_OFFSET es, 0;*/\
+ pushl %ds; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ /*CFI_REL_OFFSET ds, 0;*/\
+ pushl %eax; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET eax, 0;\
+ pushl %ebp; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET ebp, 0;\
+ pushl %edi; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET edi, 0;\
+ pushl %esi; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET esi, 0;\
+ pushl %edx; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET edx, 0;\
+ pushl %ecx; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET ecx, 0;\
+ pushl %ebx; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET ebx, 0;\
+ movl $(__USER_DS), %edx; \
+ movl %edx, %ds; \
+ movl %edx, %es; \
+ movl $(__KERNEL_PERCPU), %edx; \
+ movl %edx, %fs
+
+#define RESTORE_INT_REGS \
+ popl %ebx; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE ebx;\
+ popl %ecx; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE ecx;\
+ popl %edx; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE edx;\
+ popl %esi; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE esi;\
+ popl %edi; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE edi;\
+ popl %ebp; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE ebp;\
+ popl %eax; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE eax
+
+#define RESTORE_REGS \
+ RESTORE_INT_REGS; \
+1: popl %ds; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ /*CFI_RESTORE ds;*/\
+2: popl %es; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ /*CFI_RESTORE es;*/\
+3: popl %fs; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ /*CFI_RESTORE fs;*/\
+.pushsection .fixup,"ax"; \
+4: movl $0,(%esp); \
+ jmp 1b; \
+5: movl $0,(%esp); \
+ jmp 2b; \
+6: movl $0,(%esp); \
+ jmp 3b; \
+.section __ex_table,"a";\
+ .align 4; \
+ .long 1b,4b; \
+ .long 2b,5b; \
+ .long 3b,6b; \
+.popsection
+
+#define RING0_INT_FRAME \
+ CFI_STARTPROC simple;\
+ CFI_SIGNAL_FRAME;\
+ CFI_DEF_CFA esp, 3*4;\
+ /*CFI_OFFSET cs, -2*4;*/\
+ CFI_OFFSET eip, -3*4
+
+#define RING0_EC_FRAME \
+ CFI_STARTPROC simple;\
+ CFI_SIGNAL_FRAME;\
+ CFI_DEF_CFA esp, 4*4;\
+ /*CFI_OFFSET cs, -2*4;*/\
+ CFI_OFFSET eip, -3*4
+
+#define RING0_PTREGS_FRAME \
+ CFI_STARTPROC simple;\
+ CFI_SIGNAL_FRAME;\
+ CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
+ /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
+ CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
+ /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
+ /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
+ CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
+ CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
+ CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
+ CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
+ CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
+ CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
+ CFI_OFFSET ebx, PT_EBX-PT_OLDESP
+
+ENTRY(ret_from_fork)
+ CFI_STARTPROC
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ call schedule_tail
+ GET_THREAD_INFO(%ebp)
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ pushl $0x0202 # Reset kernel eflags
+ CFI_ADJUST_CFA_OFFSET 4
+ popfl
+ CFI_ADJUST_CFA_OFFSET -4
+ jmp syscall_exit
+ CFI_ENDPROC
+END(ret_from_fork)
+
+/*
+ * Return to user mode is not as complex as all this looks,
+ * but we want the default path for a system call return to
+ * go as quickly as possible which is why some of this is
+ * less clear than it otherwise should be.
+ */
+
+ # userspace resumption stub bypassing syscall exit tracing
+ ALIGN
+ RING0_PTREGS_FRAME
+ret_from_exception:
+ preempt_stop(CLBR_ANY)
+ret_from_intr:
+ GET_THREAD_INFO(%ebp)
+check_userspace:
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+ cmpl $USER_RPL, %eax
+ jb resume_kernel # not returning to v8086 or userspace
+
+ENTRY(resume_userspace)
+ LOCKDEP_SYS_EXIT
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
+ # int/exception return?
+ jne work_pending
+ jmp restore_all
+END(ret_from_exception)
+
+#ifdef CONFIG_PREEMPT
+ENTRY(resume_kernel)
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
+ jnz restore_nocheck
+need_resched:
+ movl TI_flags(%ebp), %ecx # need_resched set ?
+ testb $_TIF_NEED_RESCHED, %cl
+ jz restore_all
+ testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
+ jz restore_all
+ call preempt_schedule_irq
+ jmp need_resched
+END(resume_kernel)
+#endif
+ CFI_ENDPROC
+
+ .macro test_tif ti_reg # system call tracing in operation / emulation
+ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
+ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
+ .endm
+
+/* SYSENTER_RETURN points to after the "sysenter" instruction in
+ the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
+
+ # sysenter call handler stub
+ENTRY(ia32_sysenter_target)
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, 0
+ CFI_REGISTER esp, ebp
+ movl SYSENTER_stack_sp0(%esp),%esp
+sysenter_past_esp:
+ /*
+ * Interrupts are disabled here, but we can't trace it until
+ * enough kernel state to call TRACE_IRQS_OFF can be called - but
+ * we immediately enable interrupts at that point anyway.
+ */
+ pushl $(__USER_DS)
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ss, 0*/
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esp, 0
+ pushfl
+ orl $X86_EFLAGS_IF, (%esp)
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $(__USER_CS)
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET cs, 0*/
+ /*
+ * Push current_thread_info()->sysenter_return to the stack.
+ * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
+ * pushed above; +8 corresponds to copy_thread's esp0 setting.
+ */
+ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eip, 0
+
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
+/*
+ * Load the potential sixth argument from user stack.
+ * Careful about security.
+ */
+ cmpl $__PAGE_OFFSET-3,%ebp
+ jae syscall_fault
+1: movl (%ebp),%ebp
+ movl %ebp,PT_EBP(%esp)
+.section __ex_table,"a"
+ .align 4
+ .long 1b,syscall_fault
+.previous
+
+ GET_THREAD_INFO(%ebp)
+ test_tif %ebp
+ jnz syscall_trace_entry
+ cmpl $(nr_syscalls), %eax
+ jae syscall_badsys
+ call *sys_call_table(,%eax,4)
+ movl %eax,PT_EAX(%esp)
+ LOCKDEP_SYS_EXIT
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ testw $_TIF_ALLWORK_MASK, %cx
+ jne syscall_exit_work
+/* if something modifies registers it must also disable sysexit */
+ movl PT_EIP(%esp), %edx
+ movl PT_OLDESP(%esp), %ecx
+ xorl %ebp,%ebp
+ TRACE_IRQS_ON
+1: mov PT_FS(%esp), %fs
+ ENABLE_INTERRUPTS_SYSCALL_RET
+ CFI_ENDPROC
+.pushsection .fixup,"ax"
+2: movl $0,PT_FS(%esp)
+ jmp 1b
+.section __ex_table,"a"
+ .align 4
+ .long 1b,2b
+.popsection
+ENDPROC(ia32_sysenter_target)
+
+ # pv sysenter call handler stub
+ENTRY(ia32pv_sysenter_target)
+ RING0_INT_FRAME
+ movl $__USER_DS,16(%esp)
+ movl %ebp,12(%esp)
+ movl $__USER_CS,4(%esp)
+ addl $4,%esp
+ CFI_ADJUST_CFA_OFFSET -4
+ /* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */
+ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
+ CFI_ADJUST_CFA_OFFSET 4
+/*
+ * Load the potential sixth argument from user stack.
+ * Careful about security.
+ */
+ cmpl $__PAGE_OFFSET-3,%ebp
+ jae syscall_fault
+1: movl (%ebp),%ebp
+.section __ex_table,"a"
+ .align 4
+ .long 1b,syscall_fault
+.previous
+ /* fall through */
+ CFI_ENDPROC
+ENDPROC(ia32pv_sysenter_target)
+
+ # system call handler stub
+ENTRY(system_call)
+ RING0_INT_FRAME # can't unwind into user space anyway
+ pushl %eax # save orig_eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ GET_THREAD_INFO(%ebp)
+ test_tif %ebp
+ jnz syscall_trace_entry
+ cmpl $(nr_syscalls), %eax
+ jae syscall_badsys
+syscall_call:
+ call *sys_call_table(,%eax,4)
+ movl %eax,PT_EAX(%esp) # store the return value
+syscall_exit:
+ LOCKDEP_SYS_EXIT
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
+ TRACE_IRQS_OFF
+ testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
+ jz no_singlestep
+ orl $_TIF_SINGLESTEP,TI_flags(%ebp)
+no_singlestep:
+ movl TI_flags(%ebp), %ecx
+ testw $_TIF_ALLWORK_MASK, %cx # current->work
+ jne syscall_exit_work
+
+restore_all:
+#ifndef CONFIG_XEN
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
+ # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+ # are returning to the kernel.
+ # See comments in process.c:copy_thread() for details.
+ movb PT_OLDSS(%esp), %ah
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
+ CFI_REMEMBER_STATE
+ je ldt_ss # returning to user-space with LDT SS
+restore_nocheck:
+#else
+restore_nocheck:
+ movl PT_EFLAGS(%esp), %eax
+ testl $(X86_EFLAGS_VM|NMI_MASK), %eax
+ CFI_REMEMBER_STATE
+ jnz hypervisor_iret
+ shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
+ GET_VCPU_INFO
+ andb evtchn_upcall_mask(%esi),%al
+ andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
+ CFI_REMEMBER_STATE
+ jnz restore_all_enable_events # != 0 => enable event delivery
+#endif
+ TRACE_IRQS_IRET
+restore_nocheck_notrace:
+ RESTORE_REGS
+ addl $4, %esp # skip orig_eax/error_code
+ CFI_ADJUST_CFA_OFFSET -4
+irq_return:
+ INTERRUPT_RETURN
+.section .fixup,"ax"
+ENTRY(iret_exc)
+ pushl $0 # no error code
+ pushl $do_iret_error
+ jmp error_code
+.previous
+.section __ex_table,"a"
+ .align 4
+ .long irq_return,iret_exc
+.previous
+
+ CFI_RESTORE_STATE
+#ifndef CONFIG_XEN
+ldt_ss:
+ larl PT_OLDSS(%esp), %eax
+ jnz restore_nocheck
+ testl $0x00400000, %eax # returning to 32bit stack?
+ jnz restore_nocheck # allright, normal return
+
+#ifdef CONFIG_PARAVIRT
+ /*
+ * The kernel can't run on a non-flat stack if paravirt mode
+ * is active. Rather than try to fixup the high bits of
+ * ESP, bypass this code entirely. This may break DOSemu
+ * and/or Wine support in a paravirt VM, although the option
+ * is still available to implement the setting of the high
+ * 16-bits in the INTERRUPT_RETURN paravirt-op.
+ */
+ cmpl $0, pv_info+PARAVIRT_enabled
+ jne restore_nocheck
+#endif
+
+ /* If returning to userspace with 16bit stack,
+ * try to fix the higher word of ESP, as the CPU
+ * won't restore it.
+ * This is an "official" bug of all the x86-compatible
+ * CPUs, which we can try to work around to make
+ * dosemu and wine happy. */
+ movl PT_OLDESP(%esp), %eax
+ movl %esp, %edx
+ call patch_espfix_desc
+ pushl $__ESPFIX_SS
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ DISABLE_INTERRUPTS(CLBR_EAX)
+ TRACE_IRQS_OFF
+ lss (%esp), %esp
+ CFI_ADJUST_CFA_OFFSET -8
+ jmp restore_nocheck
+#else
+ ALIGN
+restore_all_enable_events:
+ TRACE_IRQS_ON
+ __ENABLE_INTERRUPTS
+scrit: /**** START OF CRITICAL REGION ****/
+ __TEST_PENDING
+ jnz 14f # process more events if necessary...
+ RESTORE_REGS
+ addl $4, %esp
+ CFI_ADJUST_CFA_OFFSET -4
+1: INTERRUPT_RETURN
+.section __ex_table,"a"
+ .align 4
+ .long 1b,iret_exc
+.previous
+14: __DISABLE_INTERRUPTS
+ TRACE_IRQS_OFF
+ jmp 11f
+ecrit: /**** END OF CRITICAL REGION ****/
+
+ CFI_RESTORE_STATE
+hypervisor_iret:
+ andl $~NMI_MASK, PT_EFLAGS(%esp)
+ RESTORE_REGS
+ addl $4, %esp
+ CFI_ADJUST_CFA_OFFSET -4
+ jmp hypercall_page + (__HYPERVISOR_iret * 32)
+#endif
+ CFI_ENDPROC
+ENDPROC(system_call)
+
+ # perform work that needs to be done immediately before resumption
+ ALIGN
+ RING0_PTREGS_FRAME # can't unwind into user space anyway
+work_pending:
+ testb $_TIF_NEED_RESCHED, %cl
+ jz work_notifysig
+work_resched:
+ call schedule
+ LOCKDEP_SYS_EXIT
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
+ # than syscall tracing?
+ jz restore_all
+ testb $_TIF_NEED_RESCHED, %cl
+ jnz work_resched
+
+work_notifysig: # deal with pending signals and
+ # notify-resume requests
+#ifdef CONFIG_VM86
+ testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
+ movl %esp, %eax
+ jne work_notifysig_v86 # returning to kernel-space or
+ # vm86-space
+ xorl %edx, %edx
+ call do_notify_resume
+ jmp resume_userspace_sig
+
+ ALIGN
+work_notifysig_v86:
+ pushl %ecx # save ti_flags for do_notify_resume
+ CFI_ADJUST_CFA_OFFSET 4
+ call save_v86_state # %eax contains pt_regs pointer
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ movl %eax, %esp
+#else
+ movl %esp, %eax
+#endif
+ xorl %edx, %edx
+ call do_notify_resume
+ jmp resume_userspace_sig
+END(work_pending)
+
+ # perform syscall exit tracing
+ ALIGN
+syscall_trace_entry:
+ movl $-ENOSYS,PT_EAX(%esp)
+ movl %esp, %eax
+ xorl %edx,%edx
+ call do_syscall_trace
+ cmpl $0, %eax
+ jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
+ # so must skip actual syscall
+ movl PT_ORIG_EAX(%esp), %eax
+ cmpl $(nr_syscalls), %eax
+ jnae syscall_call
+ jmp syscall_exit
+END(syscall_trace_entry)
+
+ # perform syscall exit tracing
+ ALIGN
+syscall_exit_work:
+ testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
+ jz work_pending
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
+ # schedule() instead
+ movl %esp, %eax
+ movl $1, %edx
+ call do_syscall_trace
+ jmp resume_userspace
+END(syscall_exit_work)
+ CFI_ENDPROC
+
+ RING0_INT_FRAME # can't unwind into user space anyway
+syscall_fault:
+ GET_THREAD_INFO(%ebp)
+ movl $-EFAULT,PT_EAX(%esp)
+ jmp resume_userspace
+END(syscall_fault)
+
+syscall_badsys:
+ movl $-ENOSYS,PT_EAX(%esp)
+ jmp resume_userspace
+END(syscall_badsys)
+ CFI_ENDPROC
+
+#ifndef CONFIG_XEN
+#define FIXUP_ESPFIX_STACK \
+ /* since we are on a wrong stack, we cant make it a C code :( */ \
+ PER_CPU(gdt_page, %ebx); \
+ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
+ addl %esp, %eax; \
+ pushl $__KERNEL_DS; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ pushl %eax; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ lss (%esp), %esp; \
+ CFI_ADJUST_CFA_OFFSET -8;
+#define UNWIND_ESPFIX_STACK \
+ movl %ss, %eax; \
+ /* see if on espfix stack */ \
+ cmpw $__ESPFIX_SS, %ax; \
+ jne 27f; \
+ movl $__KERNEL_DS, %eax; \
+ movl %eax, %ds; \
+ movl %eax, %es; \
+ /* switch to normal stack */ \
+ FIXUP_ESPFIX_STACK; \
+27:;
+
+/*
+ * Build the entry stubs and pointer table with
+ * some assembler magic.
+ */
+.section .rodata,"a"
+ENTRY(interrupt)
+.text
+
+ENTRY(irq_entries_start)
+ RING0_INT_FRAME
+vector=0
+.rept NR_IRQS
+ ALIGN
+ .if vector
+ CFI_ADJUST_CFA_OFFSET -4
+ .endif
+1: pushl $~(vector)
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp common_interrupt
+ .previous
+ .long 1b
+ .text
+vector=vector+1
+.endr
+END(irq_entries_start)
+
+.previous
+END(interrupt)
+.previous
+
+/*
+ * the CPU automatically disables interrupts when executing an IRQ vector,
+ * so IRQ-flags tracing has to follow that:
+ */
+ ALIGN
+common_interrupt:
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ movl %esp,%eax
+ call do_IRQ
+ jmp ret_from_intr
+ENDPROC(common_interrupt)
+ CFI_ENDPROC
+
+#define BUILD_INTERRUPT(name, nr) \
+ENTRY(name) \
+ RING0_INT_FRAME; \
+ pushl $~(nr); \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ SAVE_ALL; \
+ TRACE_IRQS_OFF \
+ movl %esp,%eax; \
+ call smp_##name; \
+ jmp ret_from_intr; \
+ CFI_ENDPROC; \
+ENDPROC(name)
+
+/* The include is where all of the SMP etc. interrupts come from */
+#include "entry_arch.h"
+
+#else
+#define UNWIND_ESPFIX_STACK
+#endif
+
+KPROBE_ENTRY(page_fault)
+ RING0_EC_FRAME
+ pushl $do_page_fault
+ CFI_ADJUST_CFA_OFFSET 4
+ ALIGN
+error_code:
+ /* the function address is in %fs's slot on the stack */
+ pushl %es
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET es, 0*/
+ pushl %ds
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ds, 0*/
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eax, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+ pushl %edi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edi, 0
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ cld
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0*/
+ movl $(__KERNEL_PERCPU), %ecx
+ movl %ecx, %fs
+ UNWIND_ESPFIX_STACK
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_REGISTER es, ecx*/
+ movl PT_FS(%esp), %edi # get the function address
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+ mov %ecx, PT_FS(%esp)
+ /*CFI_REL_OFFSET fs, ES*/
+ movl $(__USER_DS), %ecx
+ movl %ecx, %ds
+ movl %ecx, %es
+ movl %esp,%eax # pt_regs pointer
+ call *%edi
+ jmp ret_from_exception
+ CFI_ENDPROC
+KPROBE_END(page_fault)
+
+#ifdef CONFIG_XEN
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+#
+# The sysexit critical region is slightly different. sysexit
+# atomically removes the entire stack frame. If we interrupt in the
+# critical region we know that the entire frame is present and correct
+# so we can simply throw away the new one.
+ENTRY(hypervisor_callback)
+ RING0_INT_FRAME
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ movl PT_EIP(%esp),%eax
+ cmpl $scrit,%eax
+ jb 11f
+ cmpl $ecrit,%eax
+ jb critical_region_fixup
+ cmpl $sysexit_scrit,%eax
+ jb 11f
+ cmpl $sysexit_ecrit,%eax
+ ja 11f
+ addl $PT_OLDESP,%esp # Remove eflags...ebx from stack frame.
+11: push %esp
+ CFI_ADJUST_CFA_OFFSET 4
+ call evtchn_do_upcall
+ add $4,%esp
+ CFI_ADJUST_CFA_OFFSET -4
+ jmp ret_from_intr
+ CFI_ENDPROC
+
+# [How we do the fixup]. We want to merge the current stack frame with the
+# just-interrupted frame. How we do this depends on where in the critical
+# region the interrupted handler was executing, and so how many saved
+# registers are in each frame. We do this quickly using the lookup table
+# 'critical_fixup_table'. For each byte offset in the critical region, it
+# provides the number of bytes which have already been popped from the
+# interrupted stack frame.
+critical_region_fixup:
+ movzbl critical_fixup_table-scrit(%eax),%ecx # %eax contains num bytes popped
+ cmpb $0xff,%cl # 0xff => vcpu_info critical region
+ jne 15f
+ xorl %ecx,%ecx
+15: leal (%esp,%ecx),%esi # %esi points at end of src region
+ leal PT_OLDESP(%esp),%edi # %edi points at end of dst region
+ shrl $2,%ecx # convert words to bytes
+ je 17f # skip loop if nothing to copy
+16: subl $4,%esi # pre-decrementing copy loop
+ subl $4,%edi
+ movl (%esi),%eax
+ movl %eax,(%edi)
+ loop 16b
+17: movl %edi,%esp # final %edi is top of merged stack
+ jmp 11b
+
+.section .rodata,"a"
+critical_fixup_table:
+ .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING
+ .byte 0xff,0xff # jnz 14f
+ .byte 0x00 # pop %ebx
+ .byte 0x04 # pop %ecx
+ .byte 0x08 # pop %edx
+ .byte 0x0c # pop %esi
+ .byte 0x10 # pop %edi
+ .byte 0x14 # pop %ebp
+ .byte 0x18 # pop %eax
+ .byte 0x1c # pop %ds
+ .byte 0x20 # pop %es
+ .byte 0x24,0x24 # pop %fs
+ .byte 0x28,0x28,0x28 # add $4,%esp
+ .byte 0x2c # iret
+ .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi)
+ .byte 0x00,0x00 # jmp 11b
+.previous
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+# 1. Fault while reloading DS, ES, FS or GS
+# 2. Fault while executing IRET
+# Category 1 we fix up by reattempting the load, and zeroing the segment
+# register if the load fails.
+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by maintaining a status value in EAX.
+ENTRY(failsafe_callback)
+ pushl %eax
+ movl $1,%eax
+1: mov 4(%esp),%ds
+2: mov 8(%esp),%es
+3: mov 12(%esp),%fs
+4: mov 16(%esp),%gs
+ testl %eax,%eax
+ popl %eax
+ jz 5f
+ addl $16,%esp # EAX != 0 => Category 2 (Bad IRET)
+ jmp iret_exc
+5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment)
+ RING0_INT_FRAME
+ pushl $0
+ SAVE_ALL
+ jmp ret_from_exception
+.section .fixup,"ax"; \
+6: xorl %eax,%eax; \
+ movl %eax,4(%esp); \
+ jmp 1b; \
+7: xorl %eax,%eax; \
+ movl %eax,8(%esp); \
+ jmp 2b; \
+8: xorl %eax,%eax; \
+ movl %eax,12(%esp); \
+ jmp 3b; \
+9: xorl %eax,%eax; \
+ movl %eax,16(%esp); \
+ jmp 4b; \
+.previous; \
+.section __ex_table,"a"; \
+ .align 4; \
+ .long 1b,6b; \
+ .long 2b,7b; \
+ .long 3b,8b; \
+ .long 4b,9b; \
+.previous
+#endif
+ CFI_ENDPROC
+
+ENTRY(coprocessor_error)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_coprocessor_error
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(coprocessor_error)
+
+ENTRY(simd_coprocessor_error)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_simd_coprocessor_error
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(simd_coprocessor_error)
+
+ENTRY(device_not_available)
+ RING0_INT_FRAME
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+#ifndef CONFIG_XEN
+ GET_CR0_INTO_EAX
+ testl $0x4, %eax # EM (math emulation bit)
+ je device_available_emulate
+ pushl $0 # temporary storage for ORIG_EIP
+ CFI_ADJUST_CFA_OFFSET 4
+ call math_emulate
+ addl $4, %esp
+ CFI_ADJUST_CFA_OFFSET -4
+ jmp ret_from_exception
+device_available_emulate:
+#endif
+ preempt_stop(CLBR_ANY)
+ call math_state_restore
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(device_not_available)
+
+#ifndef CONFIG_XEN
+/*
+ * Debug traps and NMI can happen at the one SYSENTER instruction
+ * that sets up the real kernel stack. Check here, since we can't
+ * allow the wrong stack to be used.
+ *
+ * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have
+ * already pushed 3 words if it hits on the sysenter instruction:
+ * eflags, cs and eip.
+ *
+ * We just load the right stack, and push the three (known) values
+ * by hand onto the new stack - while updating the return eip past
+ * the instruction that would have done it for sysenter.
+ */
+#define FIX_STACK(offset, ok, label) \
+ cmpw $__KERNEL_CS,4(%esp); \
+ jne ok; \
+label: \
+ movl SYSENTER_stack_sp0+offset(%esp),%esp; \
+ CFI_DEF_CFA esp, 0; \
+ CFI_UNDEFINED eip; \
+ pushfl; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ pushl $__KERNEL_CS; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ pushl $sysenter_past_esp; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ CFI_REL_OFFSET eip, 0
+#endif /* CONFIG_XEN */
+
+KPROBE_ENTRY(debug)
+ RING0_INT_FRAME
+#ifndef CONFIG_XEN
+ cmpl $ia32_sysenter_target,(%esp)
+ jne debug_stack_correct
+ FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
+debug_stack_correct:
+#endif /* !CONFIG_XEN */
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ xorl %edx,%edx # error code 0
+ movl %esp,%eax # pt_regs pointer
+ call do_debug
+ jmp ret_from_exception
+ CFI_ENDPROC
+KPROBE_END(debug)
+
+#ifndef CONFIG_XEN
+/*
+ * NMI is doubly nasty. It can happen _while_ we're handling
+ * a debug fault, and the debug fault hasn't yet been able to
+ * clear up the stack. So we first check whether we got an
+ * NMI on the sysenter entry path, but after that we need to
+ * check whether we got an NMI on the debug path where the debug
+ * fault happened on the sysenter path.
+ */
+KPROBE_ENTRY(nmi)
+ RING0_INT_FRAME
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl %ss, %eax
+ cmpw $__ESPFIX_SS, %ax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ je nmi_espfix_stack
+ cmpl $ia32_sysenter_target,(%esp)
+ je nmi_stack_fixup
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl %esp,%eax
+ /* Do not access memory above the end of our stack page,
+ * it might not exist.
+ */
+ andl $(THREAD_SIZE-1),%eax
+ cmpl $(THREAD_SIZE-20),%eax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ jae nmi_stack_correct
+ cmpl $ia32_sysenter_target,12(%esp)
+ je nmi_debug_stack_check
+nmi_stack_correct:
+ /* We have a RING0_INT_FRAME here */
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_nmi
+ jmp restore_nocheck_notrace
+ CFI_ENDPROC
+
+nmi_stack_fixup:
+ RING0_INT_FRAME
+ FIX_STACK(12,nmi_stack_correct, 1)
+ jmp nmi_stack_correct
+
+nmi_debug_stack_check:
+ /* We have a RING0_INT_FRAME here */
+ cmpw $__KERNEL_CS,16(%esp)
+ jne nmi_stack_correct
+ cmpl $debug,(%esp)
+ jb nmi_stack_correct
+ cmpl $debug_esp_fix_insn,(%esp)
+ ja nmi_stack_correct
+ FIX_STACK(24,nmi_stack_correct, 1)
+ jmp nmi_stack_correct
+
+nmi_espfix_stack:
+ /* We have a RING0_INT_FRAME here.
+ *
+ * create the pointer to lss back
+ */
+ pushl %ss
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %esp
+ CFI_ADJUST_CFA_OFFSET 4
+ addw $4, (%esp)
+ /* copy the iret frame of 12 bytes */
+ .rept 3
+ pushl 16(%esp)
+ CFI_ADJUST_CFA_OFFSET 4
+ .endr
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ FIXUP_ESPFIX_STACK # %eax == %esp
+ xorl %edx,%edx # zero error code
+ call do_nmi
+ RESTORE_REGS
+ lss 12+4(%esp), %esp # back to espfix stack
+ CFI_ADJUST_CFA_OFFSET -24
+ jmp irq_return
+ CFI_ENDPROC
+#else
+KPROBE_ENTRY(nmi)
+ RING0_INT_FRAME
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_nmi
+ orl $NMI_MASK, PT_EFLAGS(%esp)
+ jmp restore_all
+ CFI_ENDPROC
+#endif
+KPROBE_END(nmi)
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_iret)
+ iret
+.section __ex_table,"a"
+ .align 4
+ .long native_iret, iret_exc
+.previous
+END(native_iret)
+
+ENTRY(native_irq_enable_syscall_ret)
+ sti
+ sysexit
+END(native_irq_enable_syscall_ret)
+#endif
+
+KPROBE_ENTRY(int3)
+ RING0_INT_FRAME
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_int3
+ jmp ret_from_exception
+ CFI_ENDPROC
+KPROBE_END(int3)
+
+ENTRY(overflow)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_overflow
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(overflow)
+
+ENTRY(bounds)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_bounds
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(bounds)
+
+ENTRY(invalid_op)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_invalid_op
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(invalid_op)
+
+ENTRY(coprocessor_segment_overrun)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_coprocessor_segment_overrun
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(coprocessor_segment_overrun)
+
+ENTRY(invalid_TSS)
+ RING0_EC_FRAME
+ pushl $do_invalid_TSS
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(invalid_TSS)
+
+ENTRY(segment_not_present)
+ RING0_EC_FRAME
+ pushl $do_segment_not_present
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(segment_not_present)
+
+ENTRY(stack_segment)
+ RING0_EC_FRAME
+ pushl $do_stack_segment
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(stack_segment)
+
+KPROBE_ENTRY(general_protection)
+ RING0_EC_FRAME
+ pushl $do_general_protection
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+KPROBE_END(general_protection)
+
+ENTRY(alignment_check)
+ RING0_EC_FRAME
+ pushl $do_alignment_check
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(alignment_check)
+
+ENTRY(divide_error)
+ RING0_INT_FRAME
+ pushl $0 # no error code
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_divide_error
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(divide_error)
+
+#ifdef CONFIG_X86_MCE
+ENTRY(machine_check)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl machine_check_vector
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(machine_check)
+#endif
+
+#ifndef CONFIG_XEN
+ENTRY(spurious_interrupt_bug)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_spurious_interrupt_bug
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+#endif /* !CONFIG_XEN */
+
+ENTRY(fixup_4gb_segment)
+ RING0_EC_FRAME
+ pushl $do_fixup_4gb_segment
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(spurious_interrupt_bug)
+
+#ifdef CONFIG_STACK_UNWIND
+ENTRY(arch_unwind_init_running)
+ CFI_STARTPROC
+ movl 4(%esp), %edx
+ movl (%esp), %ecx
+ leal 4(%esp), %eax
+ movl %ebx, PT_EBX(%edx)
+ xorl %ebx, %ebx
+ movl %ebx, PT_ECX(%edx)
+ movl %ebx, PT_EDX(%edx)
+ movl %esi, PT_ESI(%edx)
+ movl %edi, PT_EDI(%edx)
+ movl %ebp, PT_EBP(%edx)
+ movl %ebx, PT_EAX(%edx)
+ movl $__USER_DS, PT_DS(%edx)
+ movl $__USER_DS, PT_ES(%edx)
+ movl $__KERNEL_PERCPU, PT_FS(%edx)
+ movl %ebx, PT_ORIG_EAX(%edx)
+ movl %ecx, PT_EIP(%edx)
+ movl 12(%esp), %ecx
+ movl $__KERNEL_CS, PT_CS(%edx)
+ movl %ebx, PT_EFLAGS(%edx)
+ movl %eax, PT_OLDESP(%edx)
+ movl 8(%esp), %eax
+ movl %ecx, 8(%esp)
+ movl PT_EBX(%edx), %ebx
+ movl $__KERNEL_DS, PT_OLDSS(%edx)
+ jmpl *%eax
+ CFI_ENDPROC
+ENDPROC(arch_unwind_init_running)
+#endif
+
+ENTRY(kernel_thread_helper)
+ pushl $0 # fake return address for unwinder
+ CFI_STARTPROC
+ movl %edx,%eax
+ push %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ call *%ebx
+ push %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ call do_exit
+ CFI_ENDPROC
+ENDPROC(kernel_thread_helper)
+
+#include <asm/alternative-asm.h>
+
+ # pv syscall call handler stub
+ENTRY(ia32pv_cstar_target)
+ RING0_INT_FRAME
+ movl $__USER_DS,16(%esp)
+ movl %ebp,%ecx
+ movl $__USER_CS,4(%esp)
+ movl 12(%esp),%ebp
+ pushl %eax # save orig_eax
+ CFI_ADJUST_CFA_OFFSET 4
+/*
+ * Load the potential sixth argument from user stack.
+ * Careful about security.
+ */
+ cmpl $__PAGE_OFFSET-4,%ebp
+ CFI_REMEMBER_STATE
+ ja cstar_fault
+1: movl (%ebp),%ebp
+.section __ex_table,"a"
+ .align 4
+ .long 1b,cstar_fault
+.previous
+ SAVE_ALL
+ GET_THREAD_INFO(%ebp)
+ test_tif %ebp
+ jnz cstar_trace_entry
+ cmpl $nr_syscalls,%eax
+ jae cstar_badsys
+.Lcstar_call:
+ btl %eax,cstar_special
+ jc .Lcstar_special
+ call *cstar_call_table(,%eax,4)
+ movl %eax,PT_EAX(%esp) # store the return value
+.Lcstar_exit:
+ movl PT_ECX(%esp),%ecx
+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
+ jmp syscall_exit
+.Lcstar_special:
+ movl PT_ECX(%esp),%ecx
+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
+ jmp syscall_call
+cstar_set_tif:
+ movl $cstar_clear_tif,(%esp) # replace return address
+ LOCK_PREFIX
+ orl $_TIF_CSTAR,TI_flags(%ebp)
+ jmp *sys_call_table(,%eax,4)
+cstar_clear_tif:
+ movl %eax,PT_EAX(%esp) # store the return value
+ LOCK_PREFIX
+ andl $~_TIF_CSTAR,TI_flags(%ebp)
+ jmp .Lcstar_exit
+cstar_trace_entry:
+ movl $-ENOSYS,PT_EAX(%esp)
+ cmpl $nr_syscalls,%eax
+ jae 1f
+ btl %eax,cstar_special
+ jc .Lcstar_trace_special
+1: movl %esp,%eax
+ xorl %edx,%edx
+ LOCK_PREFIX
+ orl $_TIF_CSTAR,TI_flags(%ebp)
+ call do_syscall_trace
+ LOCK_PREFIX
+ andl $~_TIF_CSTAR,TI_flags(%ebp)
+ testl %eax,%eax
+ jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
+ # so must skip actual syscall
+ movl PT_ORIG_EAX(%esp),%eax
+ cmpl $nr_syscalls,%eax
+ jb .Lcstar_call
+ jmp .Lcstar_exit
+.Lcstar_trace_special:
+ movl PT_ECX(%esp),%ecx
+ movl %esp,%eax
+ xorl %edx,%edx
+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
+ call do_syscall_trace
+ testl %eax,%eax
+ jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
+ # so must skip actual syscall
+ movl PT_ORIG_EAX(%esp),%eax
+ cmpl $nr_syscalls,%eax
+ jb syscall_call
+ jmp syscall_exit
+cstar_badsys:
+ movl $-ENOSYS,PT_EAX(%esp)
+.Lcstar_resume:
+ movl PT_ECX(%esp),%ecx
+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
+ jmp resume_userspace
+ CFI_RESTORE_STATE
+cstar_fault:
+ movl $-EFAULT,%eax
+ SAVE_ALL
+ GET_THREAD_INFO(%ebp)
+ jmp .Lcstar_resume
+ CFI_ENDPROC
+ENDPROC(ia32pv_cstar_target)
+
+ENTRY(cstar_ret_from_fork)
+ CFI_STARTPROC
+ movl PT_ECX(%esp),%ecx
+ GET_THREAD_INFO(%ebp)
+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
+ LOCK_PREFIX
+ andl $~_TIF_CSTAR,TI_flags(%ebp)
+ jmp ret_from_fork
+ CFI_ENDPROC
+END(ret_from_fork)
+
+.section .rodata,"a"
+#include "syscall_table_32.S"
+
+syscall_table_size=(.-sys_call_table)
+
+#include <asm/unistd.h>
+cstar_special:
+nr=0
+mask=0
+.rept nr_syscalls+31
+ .irp n, __NR_sigreturn, __NR_rt_sigreturn
+ .if nr == \n
+ mask = mask | (1 << (\n & 31))
+ .endif
+ .endr
+ nr = nr + 1
+ .if (nr & 31) == 0
+ .long mask
+ mask = 0
+ .endif
+.endr
+#define sys_call_table cstar_call_table
+#define sys_fork cstar_set_tif
+#define sys_clone cstar_set_tif
+#define sys_vfork cstar_set_tif
+#include "syscall_table_32.S"
+#undef sys_call_table
+#undef sys_fork
+#undef sys_clone
+#undef sys_vfork
CFI_SIGNAL_FRAME
CFI_DEF_CFA esp, 0
CFI_REGISTER esp, ebp
- movl TSS_sysenter_sp0(%esp),%esp
+ movl SYSENTER_stack_sp0(%esp),%esp
sysenter_past_esp:
/*
* Interrupts are disabled here, but we can't trace it until
* that sets up the real kernel stack. Check here, since we can't
* allow the wrong stack to be used.
*
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
+ * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have
* already pushed 3 words if it hits on the sysenter instruction:
* eflags, cs and eip.
*
cmpw $__KERNEL_CS,4(%esp); \
jne ok; \
label: \
- movl TSS_sysenter_sp0+offset(%esp),%esp; \
+ movl SYSENTER_stack_sp0+offset(%esp),%esp; \
CFI_DEF_CFA esp, 0; \
CFI_UNDEFINED eip; \
pushfl; \
CFI_ENDPROC
ENDPROC(kernel_thread_helper)
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
/* Xen doesn't set %esp to be precisely what the normal sysenter
entrypoint expects, so fix it up before using the normal path. */
ENTRY(xen_sysenter_target)
.previous
ENDPROC(xen_failsafe_callback)
-#endif /* CONFIG_XEN */
+#endif /* CONFIG_PARAVIRT_XEN */
.section .rodata,"a"
#include "syscall_table_32.S"
--- /dev/null
+/*
+ * linux/arch/x86_64/entry.S
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ * Jun Nakajima <jun.nakajima@intel.com>
+ * Asit Mallick <asit.k.mallick@intel.com>
+ * Modified for Xen
+ */
+
+/*
+ * entry.S contains the system-call and fault low-level handling routines.
+ *
+ * NOTE: This code handles signal-recognition, which happens every time
+ * after an interrupt and after each system call.
+ *
+ * Normal syscalls and interrupts don't save a full stack frame, this is
+ * only done for syscall tracing, signals or fork/exec et.al.
+ *
+ * A note on terminology:
+ * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
+ * - partial stack frame: partially saved registers upto R11.
+ * - full stack frame: Like partial stack frame, but all register saved.
+ *
+ * Some macro usage:
+ * - CFI macros are used to generate dwarf2 unwind information for better
+ * backtraces. They don't change any code.
+ * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
+ * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
+ * There are unfortunately lots of special cases where some registers
+ * not touched. The macro is a big mess that should be cleaned up.
+ * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
+ * Gives a full stack frame.
+ * - ENTRY/END Define functions in the symbol table.
+ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
+ * frame that is otherwise undefined after a SYSCALL
+ * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
+ * - errorentry/paranoidentry/zeroentry - Define exception entry points.
+ */
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/asm-offsets.h>
+#include <asm/msr.h>
+#include <asm/unistd.h>
+#include <asm/thread_info.h>
+#include <asm/hw_irq.h>
+#include <asm/page.h>
+#include <asm/irqflags.h>
+#include <asm/errno.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/features.h>
+
+ .code64
+
+#ifndef CONFIG_PREEMPT
+#define retint_kernel retint_restore_args
+#endif
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_irq_enable_syscall_ret)
+ movq %gs:pda_oldrsp,%rsp
+ swapgs
+ sysretq
+#endif /* CONFIG_PARAVIRT */
+
+
+.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+#ifdef CONFIG_TRACE_IRQFLAGS
+ bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
+ jnc 1f
+ TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+NMI_MASK = 0x80000000
+
+/*
+ * C code is not supposed to know about undefined top of stack. Every time
+ * a C function with an pt_regs argument is called from the SYSCALL based
+ * fast path FIXUP_TOP_OF_STACK is needed.
+ * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
+ * manipulation.
+ */
+
+ /* %rsp:at FRAMEEND */
+ .macro FIXUP_TOP_OF_STACK tmp
+ movq $__USER_CS,CS(%rsp)
+ movq $-1,RCX(%rsp)
+ .endm
+
+ .macro RESTORE_TOP_OF_STACK tmp,offset=0
+ .endm
+
+ .macro FAKE_STACK_FRAME child_rip
+ /* push in order ss, rsp, eflags, cs, rip */
+ xorl %eax, %eax
+ pushq %rax /* ss */
+ CFI_ADJUST_CFA_OFFSET 8
+ /*CFI_REL_OFFSET ss,0*/
+ pushq %rax /* rsp */
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rsp,0
+ pushq $(1<<9) /* eflags - interrupts on */
+ CFI_ADJUST_CFA_OFFSET 8
+ /*CFI_REL_OFFSET rflags,0*/
+ pushq $__KERNEL_CS /* cs */
+ CFI_ADJUST_CFA_OFFSET 8
+ /*CFI_REL_OFFSET cs,0*/
+ pushq \child_rip /* rip */
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rip,0
+ pushq %rax /* orig rax */
+ CFI_ADJUST_CFA_OFFSET 8
+ .endm
+
+ .macro UNFAKE_STACK_FRAME
+ addq $8*6, %rsp
+ CFI_ADJUST_CFA_OFFSET -(6*8)
+ .endm
+
+ .macro CFI_DEFAULT_STACK start=1,adj=0
+ .if \start
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,SS+8-(\adj*ARGOFFSET)
+ .else
+ CFI_DEF_CFA_OFFSET SS+8-(\adj*ARGOFFSET)
+ .endif
+ .if \adj == 0
+ CFI_REL_OFFSET r15,R15
+ CFI_REL_OFFSET r14,R14
+ CFI_REL_OFFSET r13,R13
+ CFI_REL_OFFSET r12,R12
+ CFI_REL_OFFSET rbp,RBP
+ CFI_REL_OFFSET rbx,RBX
+ .endif
+ CFI_REL_OFFSET r11,R11
+ CFI_REL_OFFSET r10,R10
+ CFI_REL_OFFSET r9,R9
+ CFI_REL_OFFSET r8,R8
+ CFI_REL_OFFSET rax,RAX
+ CFI_REL_OFFSET rcx,RCX
+ CFI_REL_OFFSET rdx,RDX
+ CFI_REL_OFFSET rsi,RSI
+ CFI_REL_OFFSET rdi,RDI
+ CFI_REL_OFFSET rip,RIP
+ /*CFI_REL_OFFSET cs,CS*/
+ /*CFI_REL_OFFSET rflags,EFLAGS*/
+ CFI_REL_OFFSET rsp,RSP
+ /*CFI_REL_OFFSET ss,SS*/
+ .endm
+
+ /*
+ * Must be consistent with the definition in arch-x86/xen-x86_64.h:
+ * struct iret_context {
+ * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+ * };
+ * with rax, r11, and rcx being taken care of in the hypercall stub.
+ */
+ .macro HYPERVISOR_IRET flag
+ testb $3,1*8(%rsp)
+ jnz 2f
+ testl $NMI_MASK,2*8(%rsp)
+ jnz 2f
+
+ cmpb $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip)
+ jne 1f
+
+ /* Direct iret to kernel space. Correct CS and SS. */
+ orl $3,1*8(%rsp)
+ orl $3,4*8(%rsp)
+1: iretq
+
+2: /* Slow iret via hypervisor. */
+ andl $~NMI_MASK, 2*8(%rsp)
+ pushq $\flag
+ jmp hypercall_page + (__HYPERVISOR_iret * 32)
+ .endm
+
+/*
+ * A newly forked process directly context switches into this.
+ */
+/* rdi: prev */
+ENTRY(ret_from_fork)
+ CFI_DEFAULT_STACK
+ push kernel_eflags(%rip)
+ CFI_ADJUST_CFA_OFFSET 4
+ popf # reset kernel eflags
+ CFI_ADJUST_CFA_OFFSET -4
+ call schedule_tail
+ GET_THREAD_INFO(%rcx)
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
+ jnz rff_trace
+rff_action:
+ RESTORE_REST
+ testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
+ je int_ret_from_sys_call
+ testl $_TIF_IA32,threadinfo_flags(%rcx)
+ jnz int_ret_from_sys_call
+ RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
+ jmp ret_from_sys_call
+rff_trace:
+ movq %rsp,%rdi
+ call syscall_trace_leave
+ GET_THREAD_INFO(%rcx)
+ jmp rff_action
+ CFI_ENDPROC
+END(ret_from_fork)
+
+/*
+ * initial frame state for interrupts and exceptions
+ */
+ .macro _frame ref
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,SS+8-\ref
+ /*CFI_REL_OFFSET ss,SS-\ref*/
+ CFI_REL_OFFSET rsp,RSP-\ref
+ /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
+ /*CFI_REL_OFFSET cs,CS-\ref*/
+ CFI_REL_OFFSET rip,RIP-\ref
+ .endm
+
+/*
+ * System call entry. Upto 6 arguments in registers are supported.
+ *
+ * SYSCALL does not save anything on the stack and does not change the
+ * stack pointer.
+ */
+
+/*
+ * Register setup:
+ * rax system call number
+ * rdi arg0
+ * rcx return address for syscall/sysret, C arg3
+ * rsi arg1
+ * rdx arg2
+ * r10 arg3 (--> moved to rcx for C)
+ * r8 arg4
+ * r9 arg5
+ * r11 eflags for syscall/sysret, temporary for C
+ * r12-r15,rbp,rbx saved by C code, not touched.
+ *
+ * Interrupts are enabled on entry.
+ * Only called from user space.
+ *
+ * XXX if we had a free scratch register we could save the RSP into the stack frame
+ * and report it properly in ps. Unfortunately we haven't.
+ *
+ * When user can change the frames always force IRET. That is because
+ * it deals with uncanonical addresses better. SYSRET has trouble
+ * with them due to bugs in both AMD and Intel CPUs.
+ */
+
+ENTRY(system_call)
+ _frame (RIP-0x10)
+ SAVE_ARGS -8,0
+ movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
+ GET_THREAD_INFO(%rcx)
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
+ jnz tracesys
+ cmpq $__NR_syscall_max,%rax
+ ja badsys
+ movq %r10,%rcx
+ call *sys_call_table(,%rax,8) # XXX: rip relative
+ movq %rax,RAX-ARGOFFSET(%rsp)
+/*
+ * Syscall return path ending with SYSRET (fast path)
+ * Has incomplete stack frame and undefined top of stack.
+ */
+ret_from_sys_call:
+ movl $_TIF_ALLWORK_MASK,%edi
+ /* edi: flagmask */
+sysret_check:
+ LOCKDEP_SYS_EXIT
+ GET_THREAD_INFO(%rcx)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ movl threadinfo_flags(%rcx),%edx
+ andl %edi,%edx
+ jnz sysret_careful
+ CFI_REMEMBER_STATE
+ /*
+ * sysretq will re-enable interrupts:
+ */
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ RESTORE_ARGS 0,8,0
+ HYPERVISOR_IRET VGCF_IN_SYSCALL
+
+ CFI_RESTORE_STATE
+ /* Handle reschedules */
+ /* edx: work, edi: workmask */
+sysret_careful:
+ bt $TIF_NEED_RESCHED,%edx
+ jnc sysret_signal
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ pushq %rdi
+ CFI_ADJUST_CFA_OFFSET 8
+ call schedule
+ popq %rdi
+ CFI_ADJUST_CFA_OFFSET -8
+ jmp sysret_check
+
+ /* Handle a signal */
+sysret_signal:
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ testl $_TIF_DO_NOTIFY_MASK,%edx
+ jz 1f
+
+ /* Really a signal */
+ /* edx: work flags (arg3) */
+ leaq do_notify_resume(%rip),%rax
+ leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
+ xorl %esi,%esi # oldset -> arg2
+ call ptregscall_common
+1: movl $_TIF_NEED_RESCHED,%edi
+ /* Use IRET because user could have changed frame. This
+ works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp int_with_check
+
+badsys:
+ movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
+ jmp ret_from_sys_call
+
+ /* Do syscall tracing */
+tracesys:
+ SAVE_REST
+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
+ FIXUP_TOP_OF_STACK %rdi
+ movq %rsp,%rdi
+ call syscall_trace_enter
+ LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
+ RESTORE_REST
+ cmpq $__NR_syscall_max,%rax
+ ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
+ movq %r10,%rcx /* fixup for C */
+ call *sys_call_table(,%rax,8)
+ movq %rax,RAX-ARGOFFSET(%rsp)
+ /* Use IRET because user could have changed frame */
+
+/*
+ * Syscall return path ending with IRET.
+ * Has correct top of stack, but partial stack frame.
+ */
+ .globl int_ret_from_sys_call
+int_ret_from_sys_call:
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testb $3,CS-ARGOFFSET(%rsp)
+ jnz 1f
+ /* Need to set the proper %ss (not NULL) for ring 3 iretq */
+ movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
+ jmp retint_restore_args # retrun from ring3 kernel
+1:
+ movl $_TIF_ALLWORK_MASK,%edi
+ /* edi: mask to check */
+int_with_check:
+ LOCKDEP_SYS_EXIT_IRQ
+ GET_THREAD_INFO(%rcx)
+ movl threadinfo_flags(%rcx),%edx
+ andl %edi,%edx
+ jnz int_careful
+ andl $~TS_COMPAT,threadinfo_status(%rcx)
+ jmp retint_restore_args
+
+ /* Either reschedule or signal or syscall exit tracking needed. */
+ /* First do a reschedule test. */
+ /* edx: work, edi: workmask */
+int_careful:
+ bt $TIF_NEED_RESCHED,%edx
+ jnc int_very_careful
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ pushq %rdi
+ CFI_ADJUST_CFA_OFFSET 8
+ call schedule
+ popq %rdi
+ CFI_ADJUST_CFA_OFFSET -8
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp int_with_check
+
+ /* handle signals and tracing -- both require a full stack frame */
+int_very_careful:
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ SAVE_REST
+ /* Check for syscall exit trace */
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
+ jz int_signal
+ pushq %rdi
+ CFI_ADJUST_CFA_OFFSET 8
+ leaq 8(%rsp),%rdi # &ptregs -> arg1
+ call syscall_trace_leave
+ popq %rdi
+ CFI_ADJUST_CFA_OFFSET -8
+ andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
+ jmp int_restore_rest
+
+int_signal:
+ testl $_TIF_DO_NOTIFY_MASK,%edx
+ jz 1f
+ movq %rsp,%rdi # &ptregs -> arg1
+ xorl %esi,%esi # oldset -> arg2
+ call do_notify_resume
+1: movl $_TIF_NEED_RESCHED,%edi
+int_restore_rest:
+ RESTORE_REST
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp int_with_check
+ CFI_ENDPROC
+END(system_call)
+
+/*
+ * Certain special system calls that need to save a complete full stack frame.
+ */
+
+ .macro PTREGSCALL label,func,arg
+ .globl \label
+\label:
+ leaq \func(%rip),%rax
+ leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
+ jmp ptregscall_common
+END(\label)
+ .endm
+
+ CFI_STARTPROC
+
+ PTREGSCALL stub_clone, sys_clone, %r8
+ PTREGSCALL stub_fork, sys_fork, %rdi
+ PTREGSCALL stub_vfork, sys_vfork, %rdi
+ PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
+ PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
+ PTREGSCALL stub_iopl, sys_iopl, %rsi
+
+ENTRY(ptregscall_common)
+ popq %r11
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_REGISTER rip, r11
+ SAVE_REST
+ movq %r11, %r15
+ CFI_REGISTER rip, r15
+ FIXUP_TOP_OF_STACK %r11
+ call *%rax
+ RESTORE_TOP_OF_STACK %r11
+ movq %r15, %r11
+ CFI_REGISTER rip, r11
+ RESTORE_REST
+ pushq %r11
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rip, 0
+ ret
+ CFI_ENDPROC
+END(ptregscall_common)
+
+ENTRY(stub_execve)
+ CFI_STARTPROC
+ popq %r11
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_REGISTER rip, r11
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ movq %rsp, %rcx
+ call sys_execve
+ RESTORE_TOP_OF_STACK %r11
+ movq %rax,RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_execve)
+
+/*
+ * sigreturn is special because it needs to restore all registers on return.
+ * This cannot be done with SYSRET, so use the IRET return path instead.
+ */
+ENTRY(stub_rt_sigreturn)
+ CFI_STARTPROC
+ addq $8, %rsp
+ CFI_ADJUST_CFA_OFFSET -8
+ SAVE_REST
+ movq %rsp,%rdi
+ FIXUP_TOP_OF_STACK %r11
+ call sys_rt_sigreturn
+ movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_rt_sigreturn)
+
+/* initial frame state for interrupts (and exceptions without error code) */
+#define INTR_FRAME _frame (RIP-0x10); \
+ CFI_REL_OFFSET rcx,0; \
+ CFI_REL_OFFSET r11,8
+
+/* initial frame state for exceptions with error code (and interrupts with
+ vector already pushed) */
+#define XCPT_FRAME _frame (RIP-0x18); \
+ CFI_REL_OFFSET rcx,0; \
+ CFI_REL_OFFSET r11,8
+
+/*
+ * Interrupt exit.
+ *
+ */
+
+retint_check:
+ CFI_DEFAULT_STACK adj=1
+ LOCKDEP_SYS_EXIT_IRQ
+ movl threadinfo_flags(%rcx),%edx
+ andl %edi,%edx
+ CFI_REMEMBER_STATE
+ jnz retint_careful
+retint_restore_args: /* return to kernel space */
+ movl EFLAGS-REST_SKIP(%rsp), %eax
+ shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
+ GET_VCPU_INFO
+ andb evtchn_upcall_mask(%rsi),%al
+ andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
+ jnz restore_all_enable_events # != 0 => enable event delivery
+
+ RESTORE_ARGS 0,8,0
+ HYPERVISOR_IRET 0
+
+ /* edi: workmask, edx: work */
+retint_careful:
+ CFI_RESTORE_STATE
+ bt $TIF_NEED_RESCHED,%edx
+ jnc retint_signal
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ pushq %rdi
+ CFI_ADJUST_CFA_OFFSET 8
+ call schedule
+ popq %rdi
+ CFI_ADJUST_CFA_OFFSET -8
+ GET_THREAD_INFO(%rcx)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp retint_check
+
+retint_signal:
+ testl $_TIF_DO_NOTIFY_MASK,%edx
+ jz retint_restore_args
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ SAVE_REST
+ movq $-1,ORIG_RAX(%rsp)
+ xorl %esi,%esi # oldset
+ movq %rsp,%rdi # &pt_regs
+ call do_notify_resume
+ RESTORE_REST
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ movl $_TIF_NEED_RESCHED,%edi
+ GET_THREAD_INFO(%rcx)
+ jmp retint_check
+
+#ifdef CONFIG_PREEMPT
+ /* Returning to kernel space. Check if we need preemption */
+ /* rcx: threadinfo. interrupts off. */
+ENTRY(retint_kernel)
+ cmpl $0,threadinfo_preempt_count(%rcx)
+ jnz retint_restore_args
+ bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
+ jnc retint_restore_args
+ bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
+ jnc retint_restore_args
+ call preempt_schedule_irq
+ jmp retint_kernel /* check again */
+#endif
+
+ CFI_ENDPROC
+END(retint_check)
+
+#ifndef CONFIG_XEN
+/*
+ * APIC interrupts.
+ */
+ .macro apicinterrupt num,func
+ INTR_FRAME
+ pushq $~(\num)
+ CFI_ADJUST_CFA_OFFSET 8
+ interrupt \func
+ jmp error_entry
+ CFI_ENDPROC
+ .endm
+
+ENTRY(thermal_interrupt)
+ apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
+END(thermal_interrupt)
+
+ENTRY(threshold_interrupt)
+ apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
+END(threshold_interrupt)
+
+#ifdef CONFIG_SMP
+ENTRY(reschedule_interrupt)
+ apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
+END(reschedule_interrupt)
+
+ .macro INVALIDATE_ENTRY num
+ENTRY(invalidate_interrupt\num)
+ apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
+END(invalidate_interrupt\num)
+ .endm
+
+ INVALIDATE_ENTRY 0
+ INVALIDATE_ENTRY 1
+ INVALIDATE_ENTRY 2
+ INVALIDATE_ENTRY 3
+ INVALIDATE_ENTRY 4
+ INVALIDATE_ENTRY 5
+ INVALIDATE_ENTRY 6
+ INVALIDATE_ENTRY 7
+
+ENTRY(call_function_interrupt)
+ apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
+END(call_function_interrupt)
+ENTRY(irq_move_cleanup_interrupt)
+ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
+END(irq_move_cleanup_interrupt)
+#endif
+
+ENTRY(apic_timer_interrupt)
+ apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
+END(apic_timer_interrupt)
+
+ENTRY(error_interrupt)
+ apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
+END(error_interrupt)
+
+ENTRY(spurious_interrupt)
+ apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
+END(spurious_interrupt)
+#endif /* !CONFIG_XEN */
+
+/*
+ * Exception entry points.
+ */
+ .macro zeroentry sym
+ INTR_FRAME
+ movq (%rsp),%rcx
+ CFI_RESTORE rcx
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ addq $0x10,%rsp /* skip rcx and r11 */
+ CFI_ADJUST_CFA_OFFSET -0x10
+ pushq $0 /* push error code/oldrax */
+ CFI_ADJUST_CFA_OFFSET 8
+ pushq %rax /* push real oldrax to the rdi slot */
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rax,0
+ leaq \sym(%rip),%rax
+ jmp error_entry
+ CFI_ENDPROC
+ .endm
+
+ .macro errorentry sym
+ XCPT_FRAME
+ movq (%rsp),%rcx
+ CFI_RESTORE rcx
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ addq $0x10,%rsp /* rsp points to the error code */
+ CFI_ADJUST_CFA_OFFSET -0x10
+ pushq %rax
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rax,0
+ leaq \sym(%rip),%rax
+ jmp error_entry
+ CFI_ENDPROC
+ .endm
+
+#if 0 /* not XEN */
+ /* error code is on the stack already */
+ /* handle NMI like exceptions that can happen everywhere */
+ .macro paranoidentry sym, ist=0, irqtrace=1
+ movq (%rsp),%rcx
+ movq 8(%rsp),%r11
+ addq $0x10,%rsp /* skip rcx and r11 */
+ SAVE_ALL
+ cld
+#if 0 /* not XEN */
+ movl $1,%ebx
+ movl $MSR_GS_BASE,%ecx
+ rdmsr
+ testl %edx,%edx
+ js 1f
+ SWAPGS
+ xorl %ebx,%ebx
+1:
+#endif
+ .if \ist
+ movq %gs:pda_data_offset, %rbp
+ .endif
+ movq %rsp,%rdi
+ movq ORIG_RAX(%rsp),%rsi
+ movq $-1,ORIG_RAX(%rsp)
+ .if \ist
+ subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+ .endif
+ call \sym
+ .if \ist
+ addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+ .endif
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ .if \irqtrace
+ TRACE_IRQS_OFF
+ .endif
+ .endm
+
+ /*
+ * "Paranoid" exit path from exception stack.
+ * Paranoid because this is used by NMIs and cannot take
+ * any kernel state for granted.
+ * We don't do kernel preemption checks here, because only
+ * NMI should be common and it does not enable IRQs and
+ * cannot get reschedule ticks.
+ *
+ * "trace" is 0 for the NMI handler only, because irq-tracing
+ * is fundamentally NMI-unsafe. (we cannot change the soft and
+ * hard flags at once, atomically)
+ */
+ .macro paranoidexit trace=1
+ /* ebx: no swapgs flag */
+paranoid_exit\trace:
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz paranoid_restore\trace
+ testl $3,CS(%rsp)
+ jnz paranoid_userspace\trace
+paranoid_swapgs\trace:
+ .if \trace
+ TRACE_IRQS_IRETQ 0
+ .endif
+ SWAPGS_UNSAFE_STACK
+paranoid_restore\trace:
+ RESTORE_ALL 8
+ jmp irq_return
+paranoid_userspace\trace:
+ GET_THREAD_INFO(%rcx)
+ movl threadinfo_flags(%rcx),%ebx
+ andl $_TIF_WORK_MASK,%ebx
+ jz paranoid_swapgs\trace
+ movq %rsp,%rdi /* &pt_regs */
+ call sync_regs
+ movq %rax,%rsp /* switch stack for scheduling */
+ testl $_TIF_NEED_RESCHED,%ebx
+ jnz paranoid_schedule\trace
+ movl %ebx,%edx /* arg3: thread flags */
+ .if \trace
+ TRACE_IRQS_ON
+ .endif
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ xorl %esi,%esi /* arg2: oldset */
+ movq %rsp,%rdi /* arg1: &pt_regs */
+ call do_notify_resume
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ .if \trace
+ TRACE_IRQS_OFF
+ .endif
+ jmp paranoid_userspace\trace
+paranoid_schedule\trace:
+ .if \trace
+ TRACE_IRQS_ON
+ .endif
+ ENABLE_INTERRUPTS(CLBR_ANY)
+ call schedule
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ .if \trace
+ TRACE_IRQS_OFF
+ .endif
+ jmp paranoid_userspace\trace
+ CFI_ENDPROC
+ .endm
+#endif
+
+/*
+ * Exception entry point. This expects an error code/orig_rax on the stack
+ * and the exception handler in %rax.
+ */
+KPROBE_ENTRY(error_entry)
+ _frame RDI
+ CFI_REL_OFFSET rax,0
+ /* rdi slot contains rax, oldrax contains error code */
+ cld
+ subq $14*8,%rsp
+ CFI_ADJUST_CFA_OFFSET (14*8)
+ movq %rsi,13*8(%rsp)
+ CFI_REL_OFFSET rsi,RSI
+ movq 14*8(%rsp),%rsi /* load rax from rdi slot */
+ CFI_REGISTER rax,rsi
+ movq %rdx,12*8(%rsp)
+ CFI_REL_OFFSET rdx,RDX
+ movq %rcx,11*8(%rsp)
+ CFI_REL_OFFSET rcx,RCX
+ movq %rsi,10*8(%rsp) /* store rax */
+ CFI_REL_OFFSET rax,RAX
+ movq %r8, 9*8(%rsp)
+ CFI_REL_OFFSET r8,R8
+ movq %r9, 8*8(%rsp)
+ CFI_REL_OFFSET r9,R9
+ movq %r10,7*8(%rsp)
+ CFI_REL_OFFSET r10,R10
+ movq %r11,6*8(%rsp)
+ CFI_REL_OFFSET r11,R11
+ movq %rbx,5*8(%rsp)
+ CFI_REL_OFFSET rbx,RBX
+ movq %rbp,4*8(%rsp)
+ CFI_REL_OFFSET rbp,RBP
+ movq %r12,3*8(%rsp)
+ CFI_REL_OFFSET r12,R12
+ movq %r13,2*8(%rsp)
+ CFI_REL_OFFSET r13,R13
+ movq %r14,1*8(%rsp)
+ CFI_REL_OFFSET r14,R14
+ movq %r15,(%rsp)
+ CFI_REL_OFFSET r15,R15
+#if 0
+ cmpl $__KERNEL_CS,CS(%rsp)
+ CFI_REMEMBER_STATE
+ je error_kernelspace
+#endif
+error_call_handler:
+ movq %rdi, RDI(%rsp)
+ CFI_REL_OFFSET rdi,RDI
+ movq %rsp,%rdi
+ movq ORIG_RAX(%rsp),%rsi # get error code
+ movq $-1,ORIG_RAX(%rsp)
+ call *%rax
+error_exit:
+ RESTORE_REST
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ GET_THREAD_INFO(%rcx)
+ testb $3,CS-ARGOFFSET(%rsp)
+ jz retint_kernel
+ LOCKDEP_SYS_EXIT_IRQ
+ movl threadinfo_flags(%rcx),%edx
+ movl $_TIF_WORK_MASK,%edi
+ andl %edi,%edx
+ jnz retint_careful
+ jmp retint_restore_args
+
+#if 0
+ /*
+ * We need to re-write the logic here because we don't do iretq to
+ * to return to user mode. It's still possible that we get trap/fault
+ * in the kernel (when accessing buffers pointed to by system calls,
+ * for example).
+ *
+ */
+ CFI_RESTORE_STATE
+error_kernelspace:
+ incl %ebx
+ /* There are two places in the kernel that can potentially fault with
+ usergs. Handle them here. The exception handlers after
+ iret run with kernel gs again, so don't set the user space flag.
+ B stepping K8s sometimes report an truncated RIP for IRET
+ exceptions returning to compat mode. Check for these here too. */
+ leaq irq_return(%rip),%rbp
+ cmpq %rbp,RIP(%rsp)
+ je error_swapgs
+ movl %ebp,%ebp /* zero extend */
+ cmpq %rbp,RIP(%rsp)
+ je error_swapgs
+ cmpq $gs_change,RIP(%rsp)
+ je error_swapgs
+ jmp error_sti
+#endif
+ CFI_ENDPROC
+KPROBE_END(error_entry)
+
+ENTRY(hypervisor_callback)
+ zeroentry do_hypervisor_callback
+END(hypervisor_callback)
+
+/*
+ * Copied from arch/xen/i386/kernel/entry.S
+ */
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+ENTRY(do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
+ CFI_STARTPROC
+# Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
+# see the correct pointer to the pt_regs
+ movq %rdi, %rsp # we don't return, adjust the stack frame
+ CFI_ENDPROC
+ CFI_DEFAULT_STACK
+11: incl %gs:pda_irqcount
+ movq %rsp,%rbp
+ CFI_DEF_CFA_REGISTER rbp
+ cmovzq %gs:pda_irqstackptr,%rsp
+ pushq %rbp # backlink for old unwinder
+ call evtchn_do_upcall
+ popq %rsp
+ CFI_DEF_CFA_REGISTER rsp
+ decl %gs:pda_irqcount
+ jmp error_exit
+ CFI_ENDPROC
+END(do_hypervisor_callback)
+
+ ALIGN
+restore_all_enable_events:
+ CFI_DEFAULT_STACK adj=1
+ TRACE_IRQS_ON
+ __ENABLE_INTERRUPTS
+
+scrit: /**** START OF CRITICAL REGION ****/
+ __TEST_PENDING
+ CFI_REMEMBER_STATE
+ jnz 14f # process more events if necessary...
+ RESTORE_ARGS 0,8,0
+ HYPERVISOR_IRET 0
+
+ CFI_RESTORE_STATE
+14: __DISABLE_INTERRUPTS
+ SAVE_REST
+ movq %rsp,%rdi # set the argument again
+ jmp 11b
+ CFI_ENDPROC
+ecrit: /**** END OF CRITICAL REGION ****/
+# At this point, unlike on x86-32, we don't do the fixup to simplify the
+# code and the stack frame is more complex on x86-64.
+# When the kernel is interrupted in the critical section, the kernel
+# will do IRET in that case, and everything will be restored at that point,
+# i.e. it just resumes from the next instruction interrupted with the same context.
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+# 1. Fault while reloading DS, ES, FS or GS
+# 2. Fault while executing IRET
+# Category 1 we do not need to fix up as Xen has already reloaded all segment
+# registers that could be reloaded and zeroed the others.
+# Category 2 we fix up by killing the current process. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by comparing each saved segment register
+# with its current contents: any discrepancy means we in category 1.
+ENTRY(failsafe_callback)
+ _frame (RIP-0x30)
+ CFI_REL_OFFSET rcx, 0
+ CFI_REL_OFFSET r11, 8
+ movw %ds,%cx
+ cmpw %cx,0x10(%rsp)
+ CFI_REMEMBER_STATE
+ jne 1f
+ movw %es,%cx
+ cmpw %cx,0x18(%rsp)
+ jne 1f
+ movw %fs,%cx
+ cmpw %cx,0x20(%rsp)
+ jne 1f
+ movw %gs,%cx
+ cmpw %cx,0x28(%rsp)
+ jne 1f
+ /* All segments match their saved values => Category 2 (Bad IRET). */
+ movq (%rsp),%rcx
+ CFI_RESTORE rcx
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ addq $0x30,%rsp
+ CFI_ADJUST_CFA_OFFSET -0x30
+ movq $11,%rdi /* SIGSEGV */
+ jmp do_exit
+ CFI_RESTORE_STATE
+1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+ movq (%rsp),%rcx
+ CFI_RESTORE rcx
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ addq $0x30,%rsp
+ CFI_ADJUST_CFA_OFFSET -0x30
+ pushq $0
+ CFI_ADJUST_CFA_OFFSET 8
+ SAVE_ALL
+ jmp error_exit
+ CFI_ENDPROC
+#if 0
+ .section __ex_table,"a"
+ .align 8
+ .quad gs_change,bad_gs
+ .previous
+ .section .fixup,"ax"
+ /* running with kernelgs */
+bad_gs:
+/* swapgs */ /* switch back to user gs */
+ xorl %eax,%eax
+ movl %eax,%gs
+ jmp 2b
+ .previous
+#endif
+
+/*
+ * Create a kernel thread.
+ *
+ * C extern interface:
+ * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+ *
+ * asm input arguments:
+ * rdi: fn, rsi: arg, rdx: flags
+ */
+ENTRY(kernel_thread)
+ CFI_STARTPROC
+ FAKE_STACK_FRAME $child_rip
+ SAVE_ALL
+
+ # rdi: flags, rsi: usp, rdx: will be &pt_regs
+ movq %rdx,%rdi
+ orq kernel_thread_flags(%rip),%rdi
+ movq $-1, %rsi
+ movq %rsp, %rdx
+
+ xorl %r8d,%r8d
+ xorl %r9d,%r9d
+
+ # clone now
+ call do_fork
+ movq %rax,RAX(%rsp)
+ xorl %edi,%edi
+
+ /*
+ * It isn't worth to check for reschedule here,
+ * so internally to the x86_64 port you can rely on kernel_thread()
+ * not to reschedule the child before returning, this avoids the need
+ * of hacks for example to fork off the per-CPU idle tasks.
+ * [Hopefully no generic code relies on the reschedule -AK]
+ */
+ RESTORE_ALL
+ UNFAKE_STACK_FRAME
+ ret
+ CFI_ENDPROC
+ENDPROC(kernel_thread)
+
+child_rip:
+ pushq $0 # fake return address
+ CFI_STARTPROC
+ /*
+ * Here we are in the child and the registers are set as they were
+ * at kernel_thread() invocation in the parent.
+ */
+ movq %rdi, %rax
+ movq %rsi, %rdi
+ call *%rax
+ # exit
+ mov %eax, %edi
+ call do_exit
+ CFI_ENDPROC
+ENDPROC(child_rip)
+
+/*
+ * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
+ *
+ * C extern interface:
+ * extern long execve(char *name, char **argv, char **envp)
+ *
+ * asm input arguments:
+ * rdi: name, rsi: argv, rdx: envp
+ *
+ * We want to fallback into:
+ * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
+ *
+ * do_sys_execve asm fallback arguments:
+ * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
+ */
+ENTRY(kernel_execve)
+ CFI_STARTPROC
+ FAKE_STACK_FRAME $0
+ SAVE_ALL
+ movq %rsp,%rcx
+ call sys_execve
+ movq %rax, RAX(%rsp)
+ RESTORE_REST
+ testq %rax,%rax
+ jne 1f
+ jmp int_ret_from_sys_call
+1: RESTORE_ARGS
+ UNFAKE_STACK_FRAME
+ ret
+ CFI_ENDPROC
+ENDPROC(kernel_execve)
+
+KPROBE_ENTRY(page_fault)
+ errorentry do_page_fault
+KPROBE_END(page_fault)
+
+ENTRY(coprocessor_error)
+ zeroentry do_coprocessor_error
+END(coprocessor_error)
+
+ENTRY(simd_coprocessor_error)
+ zeroentry do_simd_coprocessor_error
+END(simd_coprocessor_error)
+
+ENTRY(device_not_available)
+ zeroentry math_state_restore
+END(device_not_available)
+
+ /* runs on exception stack */
+KPROBE_ENTRY(debug)
+/* INTR_FRAME
+ pushq $0
+ CFI_ADJUST_CFA_OFFSET 8 */
+ zeroentry do_debug
+/* paranoidexit
+ CFI_ENDPROC */
+KPROBE_END(debug)
+
+KPROBE_ENTRY(nmi)
+ zeroentry do_nmi_callback
+KPROBE_END(nmi)
+do_nmi_callback:
+ CFI_STARTPROC
+ addq $8, %rsp
+ CFI_ENDPROC
+ CFI_DEFAULT_STACK
+ call do_nmi
+ orl $NMI_MASK,EFLAGS(%rsp)
+ RESTORE_REST
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ GET_THREAD_INFO(%rcx)
+ jmp retint_restore_args
+ CFI_ENDPROC
+END(do_nmi_callback)
+
+KPROBE_ENTRY(int3)
+/* INTR_FRAME
+ pushq $0
+ CFI_ADJUST_CFA_OFFSET 8 */
+ zeroentry do_int3
+/* jmp paranoid_exit1
+ CFI_ENDPROC */
+KPROBE_END(int3)
+
+ENTRY(overflow)
+ zeroentry do_overflow
+END(overflow)
+
+ENTRY(bounds)
+ zeroentry do_bounds
+END(bounds)
+
+ENTRY(invalid_op)
+ zeroentry do_invalid_op
+END(invalid_op)
+
+ENTRY(coprocessor_segment_overrun)
+ zeroentry do_coprocessor_segment_overrun
+END(coprocessor_segment_overrun)
+
+ENTRY(reserved)
+ zeroentry do_reserved
+END(reserved)
+
+#if 0
+ /* runs on exception stack */
+ENTRY(double_fault)
+ XCPT_FRAME
+ paranoidentry do_double_fault
+ jmp paranoid_exit1
+ CFI_ENDPROC
+END(double_fault)
+#endif
+
+ENTRY(invalid_TSS)
+ errorentry do_invalid_TSS
+END(invalid_TSS)
+
+ENTRY(segment_not_present)
+ errorentry do_segment_not_present
+END(segment_not_present)
+
+ /* runs on exception stack */
+ENTRY(stack_segment)
+/* XCPT_FRAME
+ paranoidentry do_stack_segment */
+ errorentry do_stack_segment
+/* jmp paranoid_exit1
+ CFI_ENDPROC */
+END(stack_segment)
+
+KPROBE_ENTRY(general_protection)
+ errorentry do_general_protection
+KPROBE_END(general_protection)
+
+ENTRY(alignment_check)
+ errorentry do_alignment_check
+END(alignment_check)
+
+ENTRY(divide_error)
+ zeroentry do_divide_error
+END(divide_error)
+
+ENTRY(spurious_interrupt_bug)
+ zeroentry do_spurious_interrupt_bug
+END(spurious_interrupt_bug)
+
+#ifdef CONFIG_X86_MCE
+ /* runs on exception stack */
+ENTRY(machine_check)
+ INTR_FRAME
+ pushq $0
+ CFI_ADJUST_CFA_OFFSET 8
+ paranoidentry do_machine_check
+ jmp paranoid_exit1
+ CFI_ENDPROC
+END(machine_check)
+#endif
+
+/* Call softirq on interrupt stack. Interrupts are off. */
+ENTRY(call_softirq)
+ CFI_STARTPROC
+ push %rbp
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rbp,0
+ mov %rsp,%rbp
+ CFI_DEF_CFA_REGISTER rbp
+ incl %gs:pda_irqcount
+ cmove %gs:pda_irqstackptr,%rsp
+ push %rbp # backlink for old unwinder
+ call __do_softirq
+ leaveq
+ CFI_DEF_CFA_REGISTER rsp
+ CFI_ADJUST_CFA_OFFSET -8
+ decl %gs:pda_irqcount
+ ret
+ CFI_ENDPROC
+ENDPROC(call_softirq)
+
+KPROBE_ENTRY(ignore_sysret)
+ CFI_STARTPROC
+ mov $-ENOSYS,%eax
+ HYPERVISOR_IRET 0
+ CFI_ENDPROC
+ENDPROC(ignore_sysret)
+
+#ifdef CONFIG_STACK_UNWIND
+ENTRY(arch_unwind_init_running)
+ CFI_STARTPROC
+ movq %r15, R15(%rdi)
+ movq %r14, R14(%rdi)
+ xchgq %rsi, %rdx
+ movq %r13, R13(%rdi)
+ movq %r12, R12(%rdi)
+ xorl %eax, %eax
+ movq %rbp, RBP(%rdi)
+ movq %rbx, RBX(%rdi)
+ movq (%rsp), %rcx
+ movq %rax, R11(%rdi)
+ movq %rax, R10(%rdi)
+ movq %rax, R9(%rdi)
+ movq %rax, R8(%rdi)
+ movq %rax, RAX(%rdi)
+ movq %rax, RCX(%rdi)
+ movq %rax, RDX(%rdi)
+ movq %rax, RSI(%rdi)
+ movq %rax, RDI(%rdi)
+ movq %rax, ORIG_RAX(%rdi)
+ movq %rcx, RIP(%rdi)
+ leaq 8(%rsp), %rcx
+ movq $__KERNEL_CS, CS(%rdi)
+ movq %rax, EFLAGS(%rdi)
+ movq %rcx, RSP(%rdi)
+ movq $__KERNEL_DS, SS(%rdi)
+ jmpq *%rdx
+ CFI_ENDPROC
+ENDPROC(arch_unwind_init_running)
+#endif
--- /dev/null
+/******************************************************************************
+ * fixup.c
+ *
+ * Binary-rewriting of certain IA32 instructions, on notification by Xen.
+ * Used to avoid repeated slow emulation of common instructions used by the
+ * user-space TLS (Thread-Local Storage) libraries.
+ *
+ * **** NOTE ****
+ * Issues with the binary rewriting have caused it to be removed. Instead
+ * we rely on Xen's emulator to boot the kernel, and then print a banner
+ * message recommending that the user disables /lib/tls.
+ *
+ * Copyright (c) 2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/version.h>
+
+#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
+
+void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
+{
+ static unsigned long printed = 0;
+ char info[100];
+ int i;
+
+ /* Ignore statically-linked init. */
+ if (current->tgid == 1)
+ return;
+
+ VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable,
+ VMASST_TYPE_4gb_segments_notify));
+
+ if (test_and_set_bit(0, &printed))
+ return;
+
+ sprintf(info, "%s (pid=%d)", current->comm, current->tgid);
+
+ DP("");
+ DP("***************************************************************");
+ DP("***************************************************************");
+ DP("** WARNING: Currently emulating unsupported memory accesses **");
+ DP("** in /lib/tls glibc libraries. The emulation is **");
+ DP("** slow. To ensure full performance you should **");
+ DP("** install a 'xen-friendly' (nosegneg) version of **");
+ DP("** the library, or disable tls support by executing **");
+ DP("** the following as root: **");
+ DP("** mv /lib/tls /lib/tls.disabled **");
+ DP("** Offending process: %-38.38s **", info);
+ DP("***************************************************************");
+ DP("***************************************************************");
+ DP("");
+
+ for (i = 5; i > 0; i--) {
+ touch_softlockup_watchdog();
+ printk("Pausing... %d", i);
+ mdelay(1000);
+ printk("\b\b\b\b\b\b\b\b\b\b\b\b");
+ }
+
+ printk("Continuing...\n\n");
+}
+
+static int __init fixup_init(void)
+{
+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_4gb_segments_notify));
+ return 0;
+}
+__initcall(fixup_init);
--- /dev/null
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC sub-arch probe layer.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/hardirq.h>
+
+#include <asm/smp.h>
+#include <asm/genapic.h>
+
+#ifdef CONFIG_ACPI
+#include <acpi/acpi_bus.h>
+#endif
+
+#ifndef CONFIG_XEN
+DEFINE_PER_CPU(int, x2apic_extra_bits);
+
+struct genapic __read_mostly *genapic = &apic_flat;
+
+static enum uv_system_type uv_system_type;
+#else
+extern struct genapic apic_xen;
+struct genapic __read_mostly *genapic = &apic_xen;
+#endif
+
+
+/*
+ * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
+ */
+void __init setup_apic_routing(void)
+{
+#ifndef CONFIG_XEN
+ if (uv_system_type == UV_NON_UNIQUE_APIC)
+ genapic = &apic_x2apic_uv_x;
+ else
+#ifdef CONFIG_ACPI
+ /*
+ * Quirk: some x86_64 machines can only use physical APIC mode
+ * regardless of how many processors are present (x86_64 ES7000
+ * is an example).
+ */
+ if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
+ genapic = &apic_physflat;
+ else
+#endif
+
+ if (num_possible_cpus() <= 8)
+ genapic = &apic_flat;
+ else
+ genapic = &apic_physflat;
+
+#else
+ /* hardcode to xen apic functions */
+ genapic = &apic_xen;
+#endif
+ printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
+}
+
+/* Same for both flat and physical. */
+
+#ifdef CONFIG_XEN
+extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector);
+#endif
+
+void send_IPI_self(int vector)
+{
+#ifndef CONFIG_XEN
+ __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+#else
+ xen_send_IPI_shortcut(APIC_DEST_SELF, vector);
+#endif
+}
+
+int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+#ifndef CONFIG_XEN
+ if (!strcmp(oem_id, "SGI")) {
+ if (!strcmp(oem_table_id, "UVL"))
+ uv_system_type = UV_LEGACY_APIC;
+ else if (!strcmp(oem_table_id, "UVX"))
+ uv_system_type = UV_X2APIC;
+ else if (!strcmp(oem_table_id, "UVH"))
+ uv_system_type = UV_NON_UNIQUE_APIC;
+ }
+#endif
+ return 0;
+}
+
+#ifndef CONFIG_XEN
+unsigned int read_apic_id(void)
+{
+ unsigned int id;
+
+ WARN_ON(preemptible() && num_online_cpus() > 1);
+ id = apic_read(APIC_ID);
+ if (uv_system_type >= UV_X2APIC)
+ id |= __get_cpu_var(x2apic_extra_bits);
+ return id;
+}
+
+enum uv_system_type get_uv_system_type(void)
+{
+ return uv_system_type;
+}
+
+int is_uv_system(void)
+{
+ return uv_system_type != UV_NONE;
+}
+#endif
--- /dev/null
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Xen APIC subarch code. Maximum 8 CPUs, logical delivery.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ *
+ * Hacked to pieces for Xen by Chris Wright.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+#include <asm/smp.h>
+#else
+#include <asm/apic.h>
+#endif
+#include <asm/genapic.h>
+#include <xen/evtchn.h>
+
+DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
+
+static inline void __send_IPI_one(unsigned int cpu, int vector)
+{
+ int irq = per_cpu(ipi_to_irq, cpu)[vector];
+ BUG_ON(irq < 0);
+ notify_remote_via_irq(irq);
+}
+
+void xen_send_IPI_shortcut(unsigned int shortcut, int vector)
+{
+ int cpu;
+
+ switch (shortcut) {
+ case APIC_DEST_SELF:
+ __send_IPI_one(smp_processor_id(), vector);
+ break;
+ case APIC_DEST_ALLBUT:
+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ if (cpu == smp_processor_id())
+ continue;
+ if (cpu_isset(cpu, cpu_online_map)) {
+ __send_IPI_one(cpu, vector);
+ }
+ }
+ break;
+ case APIC_DEST_ALLINC:
+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ if (cpu_isset(cpu, cpu_online_map)) {
+ __send_IPI_one(cpu, vector);
+ }
+ }
+ break;
+ default:
+ printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
+ vector);
+ break;
+ }
+}
+
+static cpumask_t xen_target_cpus(void)
+{
+ return cpu_online_map;
+}
+
+static cpumask_t xen_vector_allocation_domain(int cpu)
+{
+ return cpumask_of_cpu(cpu);
+}
+
+/*
+ * Set up the logical destination ID.
+ * Do nothing, not called now.
+ */
+static void xen_init_apic_ldr(void)
+{
+ Dprintk("%s\n", __FUNCTION__);
+ return;
+}
+
+static void xen_send_IPI_allbutself(int vector)
+{
+ /*
+ * if there are no other CPUs in the system then
+ * we get an APIC send error if we try to broadcast.
+ * thus we have to avoid sending IPIs in this case.
+ */
+ Dprintk("%s\n", __FUNCTION__);
+ if (num_online_cpus() > 1)
+ xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
+}
+
+static void xen_send_IPI_all(int vector)
+{
+ Dprintk("%s\n", __FUNCTION__);
+ xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector);
+}
+
+static void xen_send_IPI_mask(cpumask_t cpumask, int vector)
+{
+ unsigned long mask = cpus_addr(cpumask)[0];
+ unsigned int cpu;
+ unsigned long flags;
+
+ Dprintk("%s\n", __FUNCTION__);
+ local_irq_save(flags);
+ WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
+
+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ if (cpu_isset(cpu, cpumask)) {
+ __send_IPI_one(cpu, vector);
+ }
+ }
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+static int xen_apic_id_registered(void)
+{
+ /* better be set */
+ Dprintk("%s\n", __FUNCTION__);
+ return physid_isset(smp_processor_id(), phys_cpu_present_map);
+}
+#endif
+
+static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ Dprintk("%s\n", __FUNCTION__);
+ return cpus_addr(cpumask)[0];
+}
+
+static unsigned int phys_pkg_id(int index_msb)
+{
+ u32 ebx;
+
+ Dprintk("%s\n", __FUNCTION__);
+ ebx = cpuid_ebx(1);
+ return ((ebx >> 24) & 0xFF) >> index_msb;
+}
+
+struct genapic apic_xen = {
+ .name = "xen",
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+ .int_delivery_mode = dest_LowestPrio,
+#endif
+ .int_dest_mode = 1,
+ .target_cpus = xen_target_cpus,
+ .vector_allocation_domain = xen_vector_allocation_domain,
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+ .apic_id_registered = xen_apic_id_registered,
+#endif
+ .init_apic_ldr = xen_init_apic_ldr,
+ .send_IPI_all = xen_send_IPI_all,
+ .send_IPI_allbutself = xen_send_IPI_allbutself,
+ .send_IPI_mask = xen_send_IPI_mask,
+ .cpu_mask_to_apicid = xen_cpu_mask_to_apicid,
+ .phys_pkg_id = phys_pkg_id,
+};
--- /dev/null
+/*
+ * prepare to run common code
+ *
+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ *
+ * Jun Nakajima <jun.nakajima@intel.com>
+ * Modified for Xen.
+ */
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/percpu.h>
+#include <linux/start_kernel.h>
+#include <linux/io.h>
+#include <linux/module.h>
+
+#include <asm/processor.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/e820.h>
+#include <asm/bios_ebda.h>
+
+unsigned long start_pfn;
+
+#ifndef CONFIG_XEN
+static void __init zap_identity_mappings(void)
+{
+ pgd_t *pgd = pgd_offset_k(0UL);
+ pgd_clear(pgd);
+ __flush_tlb_all();
+}
+
+/* Don't add a printk in there. printk relies on the PDA which is not initialized
+ yet. */
+static void __init clear_bss(void)
+{
+ memset(__bss_start, 0,
+ (unsigned long) __bss_stop - (unsigned long) __bss_start);
+}
+#endif
+
+static void __init copy_bootdata(char *real_mode_data)
+{
+#ifndef CONFIG_XEN
+ char * command_line;
+
+ memcpy(&boot_params, real_mode_data, sizeof boot_params);
+ if (boot_params.hdr.cmd_line_ptr) {
+ command_line = __va(boot_params.hdr.cmd_line_ptr);
+ memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
+ }
+#else
+ int max_cmdline;
+
+ if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
+ max_cmdline = COMMAND_LINE_SIZE;
+ memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
+ boot_command_line[max_cmdline-1] = '\0';
+#endif
+}
+
+#include <xen/interface/memory.h>
+unsigned long *machine_to_phys_mapping;
+EXPORT_SYMBOL(machine_to_phys_mapping);
+unsigned int machine_to_phys_order;
+EXPORT_SYMBOL(machine_to_phys_order);
+
+#define BIOS_LOWMEM_KILOBYTES 0x413
+
+/*
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
+ */
+static void __init reserve_ebda_region(void)
+{
+#ifndef CONFIG_XEN
+ unsigned int lowmem, ebda_addr;
+
+ /* To determine the position of the EBDA and the */
+ /* end of conventional memory, we need to look at */
+ /* the BIOS data area. In a paravirtual environment */
+ /* that area is absent. We'll just have to assume */
+ /* that the paravirt case can handle memory setup */
+ /* correctly, without our help. */
+ if (paravirt_enabled())
+ return;
+
+ /* end of low (conventional) memory */
+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+ lowmem <<= 10;
+
+ /* start of EBDA area */
+ ebda_addr = get_bios_ebda();
+
+ /* Fixup: bios puts an EBDA in the top 64K segment */
+ /* of conventional memory, but does not adjust lowmem. */
+ if ((lowmem - ebda_addr) <= 0x10000)
+ lowmem = ebda_addr;
+
+ /* Fixup: bios does not report an EBDA at all. */
+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+ lowmem = 0x9f000;
+
+ /* Paranoia: should never happen, but... */
+ if ((lowmem == 0) || (lowmem >= 0x100000))
+ lowmem = 0x9f000;
+
+ /* reserve all memory between lowmem and the 1MB mark */
+ reserve_early(lowmem, 0x100000, "BIOS reserved");
+#endif
+}
+
+static void __init reserve_setup_data(void)
+{
+#ifndef CONFIG_XEN
+ struct setup_data *data;
+ unsigned long pa_data;
+ char buf[32];
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, sizeof(*data));
+ sprintf(buf, "setup data %x", data->type);
+ reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+ pa_data = data->next;
+ early_iounmap(data, sizeof(*data));
+ }
+#endif
+}
+
+void __init x86_64_start_kernel(char * real_mode_data)
+{
+ struct xen_machphys_mapping mapping;
+ unsigned long machine_to_phys_nr_ents;
+ int i;
+
+ /*
+ * Build-time sanity checks on the kernel image and module
+ * area mappings. (these are purely build-time and produce no code)
+ */
+ BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START);
+ BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE);
+ BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
+ BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0);
+ BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
+ BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+ BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+ (__START_KERNEL & PGDIR_MASK)));
+
+ xen_setup_features();
+
+ xen_start_info = (struct start_info *)real_mode_data;
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ phys_to_machine_mapping =
+ (unsigned long *)xen_start_info->mfn_list;
+ start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
+ xen_start_info->nr_pt_frames;
+
+ machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
+ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
+ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
+ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
+ machine_to_phys_nr_ents = mapping.max_mfn + 1;
+ }
+ while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
+ machine_to_phys_order++;
+
+#ifndef CONFIG_XEN
+ /* clear bss before set_intr_gate with early_idt_handler */
+ clear_bss();
+
+ /* Make NULL pointers segfault */
+ zap_identity_mappings();
+
+ /* Cleanup the over mapped high alias */
+ cleanup_highmap();
+
+ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
+#ifdef CONFIG_EARLY_PRINTK
+ set_intr_gate(i, &early_idt_handlers[i]);
+#else
+ set_intr_gate(i, early_idt_handler);
+#endif
+ }
+ load_idt((const struct desc_ptr *)&idt_descr);
+#endif
+
+ early_printk("Kernel alive\n");
+
+ for (i = 0; i < NR_CPUS; i++)
+ cpu_pda(i) = &boot_cpu_pda[i];
+
+ pda_init(0);
+ copy_bootdata(__va(real_mode_data));
+
+ reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+
+ reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
+ start_pfn << PAGE_SHIFT, "Xen provided");
+
+ reserve_ebda_region();
+ reserve_setup_data();
+
+ /*
+ * At this point everything still needed from the boot loader
+ * or BIOS or kernel text should be early reserved or marked not
+ * RAM in e820. All other memory is free game.
+ */
+
+ start_kernel();
+}
--- /dev/null
+
+
+.text
+#include <linux/elfnote.h>
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+#include <asm/asm-offsets.h>
+#include <asm/boot.h>
+#include <asm/dwarf2.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/elfnote.h>
+
+/*
+ * References to members of the new_cpu_data structure.
+ */
+
+#define X86 new_cpu_data+CPUINFO_x86
+#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
+#define X86_MODEL new_cpu_data+CPUINFO_x86_model
+#define X86_MASK new_cpu_data+CPUINFO_x86_mask
+#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
+#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
+#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
+#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
+
+.section .text.head,"ax",@progbits
+#define VIRT_ENTRY_OFFSET 0x0
+.org VIRT_ENTRY_OFFSET
+ENTRY(startup_32)
+ movl %esi,xen_start_info
+ cld
+
+ /* Set up the stack pointer */
+ movl $(init_thread_union+THREAD_SIZE),%esp
+
+ movl %ss,%eax
+ movl %eax,%fs # gets reset once there's real percpu
+
+ /* get vendor info */
+ xorl %eax,%eax # call CPUID with 0 -> return vendor ID
+ XEN_CPUID
+ movl %eax,X86_CPUID # save CPUID level
+ movl %ebx,X86_VENDOR_ID # lo 4 chars
+ movl %edx,X86_VENDOR_ID+4 # next 4 chars
+ movl %ecx,X86_VENDOR_ID+8 # last 4 chars
+
+ movl $1,%eax # Use the CPUID instruction to get CPU type
+ XEN_CPUID
+ movb %al,%cl # save reg for future use
+ andb $0x0f,%ah # mask processor family
+ movb %ah,X86
+ andb $0xf0,%al # mask model
+ shrb $4,%al
+ movb %al,X86_MODEL
+ andb $0x0f,%cl # mask mask revision
+ movb %cl,X86_MASK
+ movl %edx,X86_CAPABILITY
+
+ movb $1,X86_HARD_MATH
+
+ xorl %eax,%eax # Clear GS
+ movl %eax,%gs
+
+ cld # gcc2 wants the direction flag cleared at all times
+
+ pushl $0 # fake return address for unwinder
+ jmp i386_start_kernel
+
+#define HYPERCALL_PAGE_OFFSET 0x1000
+.org HYPERCALL_PAGE_OFFSET
+ENTRY(hypercall_page)
+ CFI_STARTPROC
+.skip 0x1000
+ CFI_ENDPROC
+
+/*
+ * Real beginning of normal "text" segment
+ */
+ENTRY(stext)
+ENTRY(_stext)
+
+/*
+ * BSS section
+ */
+.section ".bss.page_aligned","wa"
+ .align PAGE_SIZE_asm
+ENTRY(swapper_pg_fixmap)
+ .fill 1024,4,0
+ENTRY(empty_zero_page)
+ .fill 4096,1,0
+
+/*
+ * This starts the data section.
+ */
+.data
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+/*
+ * __xen_guest information
+ */
+.macro utoa value
+ .if (\value) < 0 || (\value) >= 0x10
+ utoa (((\value)>>4)&0x0fffffff)
+ .endif
+ .if ((\value) & 0xf) < 10
+ .byte '0' + ((\value) & 0xf)
+ .else
+ .byte 'A' + ((\value) & 0xf) - 10
+ .endif
+.endm
+
+.section __xen_guest
+ .ascii "GUEST_OS=linux,GUEST_VER=2.6"
+ .ascii ",XEN_VER=xen-3.0"
+ .ascii ",VIRT_BASE=0x"
+ utoa __PAGE_OFFSET
+ .ascii ",ELF_PADDR_OFFSET=0x"
+ utoa __PAGE_OFFSET
+ .ascii ",VIRT_ENTRY=0x"
+ utoa (__PAGE_OFFSET + LOAD_PHYSICAL_ADDR + VIRT_ENTRY_OFFSET)
+ .ascii ",HYPERCALL_PAGE=0x"
+ utoa ((LOAD_PHYSICAL_ADDR+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
+ .ascii ",FEATURES=writable_page_tables"
+ .ascii "|writable_descriptor_tables"
+ .ascii "|auto_translated_physmap"
+ .ascii "|pae_pgdir_above_4gb"
+ .ascii "|supervisor_mode_kernel"
+#ifdef CONFIG_X86_PAE
+ .ascii ",PAE=yes[extended-cr3]"
+#else
+ .ascii ",PAE=no"
+#endif
+ .ascii ",LOADER=generic"
+ .byte 0
+#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
+
+
+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
+ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
+#if CONFIG_XEN_COMPAT <= 0x030002
+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long __PAGE_OFFSET)
+#else
+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long 0)
+#endif
+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_32)
+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
+ ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long HYPERVISOR_VIRT_START)
+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
+#ifdef CONFIG_X86_PAE
+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
+ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT)
+#else
+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
+ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long _PAGE_PRESENT, _PAGE_PRESENT)
+#endif
+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
+ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
--- /dev/null
+/*
+ * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
+ *
+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
+ * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
+ * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
+ * Jun Nakajima <jun.nakajima@intel.com>
+ * Modified for Xen
+ */
+
+
+#include <linux/linkage.h>
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <linux/elfnote.h>
+#include <asm/desc.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/cache.h>
+#include <asm/dwarf2.h>
+#include <xen/interface/elfnote.h>
+
+ .section .text.head, "ax", @progbits
+ .code64
+ .globl startup_64
+startup_64:
+ movq $(init_thread_union+THREAD_SIZE-8),%rsp
+
+ /* rsi is pointer to startup info structure.
+ pass it to C */
+ movq %rsi,%rdi
+ pushq $0 # fake return address
+ jmp x86_64_start_kernel
+
+#ifdef CONFIG_ACPI_SLEEP
+.org 0xf00
+ .globl pGDT32
+pGDT32:
+ .word gdt_end-cpu_gdt_table-1
+ .long cpu_gdt_table-__START_KERNEL_map
+#endif
+
+.balign PAGE_SIZE
+
+#define NEXT_PAGE(name) \
+ .balign PAGE_SIZE; \
+ phys_##name = . - .text.head; \
+ENTRY(name)
+
+NEXT_PAGE(init_level4_pgt)
+ .fill 512,8,0
+ /*
+ * We update two pgd entries to make kernel and user pgd consistent
+ * at pgd_populate(). It can be used for kernel modules. So we place
+ * this page here for those cases to avoid memory corruption.
+ * We also use this page to establish the initial mapping for the
+ * vsyscall area.
+ */
+ .fill 512,8,0
+
+NEXT_PAGE(level3_kernel_pgt)
+ .fill 512,8,0
+
+ /*
+ * This is used for vsyscall area mapping as we have a different
+ * level4 page table for user.
+ */
+NEXT_PAGE(level3_user_pgt)
+ .fill 512,8,0
+
+NEXT_PAGE(level2_kernel_pgt)
+ .fill 512,8,0
+
+NEXT_PAGE(level2_fixmap_pgt)
+ .fill 512,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+ .fill 512,8,0
+
+NEXT_PAGE(hypercall_page)
+ CFI_STARTPROC
+ .rept 0x1000 / 0x20
+ .skip 1 /* push %rcx */
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rcx,0
+ .skip 2 /* push %r11 */
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rcx,0
+ .skip 5 /* mov $#,%eax */
+ .skip 2 /* syscall */
+ .skip 2 /* pop %r11 */
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE r11
+ .skip 1 /* pop %rcx */
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rcx
+ .align 0x20,0 /* ret */
+ .endr
+ CFI_ENDPROC
+
+#undef NEXT_PAGE
+
+ .data
+/* Just dummy symbol to allow compilation. Not used in sleep path */
+#ifdef CONFIG_ACPI_SLEEP
+ .align PAGE_SIZE
+ENTRY(wakeup_level4_pgt)
+ .fill 512,8,0
+#endif
+
+ .data
+
+ .align 16
+ .globl cpu_gdt_descr
+cpu_gdt_descr:
+ .word gdt_end-cpu_gdt_table-1
+gdt:
+ .quad cpu_gdt_table
+#ifdef CONFIG_SMP
+ .rept NR_CPUS-1
+ .word 0
+ .quad 0
+ .endr
+#endif
+
+/* We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ */
+
+ .section .data.page_aligned, "aw"
+ .align PAGE_SIZE
+
+/* The TLS descriptors are currently at a different place compared to i386.
+ Hopefully nobody expects them at a fixed place (Wine?) */
+
+ENTRY(cpu_gdt_table)
+ .quad 0x0000000000000000 /* NULL descriptor */
+ .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
+ .quad 0x00af9b000000ffff /* __KERNEL_CS */
+ .quad 0x00cf93000000ffff /* __KERNEL_DS */
+ .quad 0x00cffb000000ffff /* __USER32_CS */
+ .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
+ .quad 0x00affb000000ffff /* __USER_CS */
+ .quad 0x0 /* unused */
+ .quad 0,0 /* TSS */
+ .quad 0,0 /* LDT */
+ .quad 0,0,0 /* three TLS descriptors */
+ .quad 0x0000f40000000000 /* node/CPU stored in limit */
+gdt_end:
+ /* asm/segment.h:GDT_ENTRIES must match this */
+ /* This should be a multiple of the cache line size */
+ /* GDTs of other CPUs are now dynamically allocated */
+
+ /* zero the remaining page */
+ .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
+
+ .section .bss.page_aligned, "aw", @nobits
+ .align PAGE_SIZE
+ENTRY(empty_zero_page)
+ .skip PAGE_SIZE
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+/*
+ * __xen_guest information
+ */
+.macro utoh value
+ i = 64
+ .rept 16
+ i = i - 4
+ .byte '0' + ((((\value) >> i) & 0xf) > 9) * ('0' - 'A' + 10) + (((\value) >> i) & 0xf)
+ .endr
+.endm
+
+.section __xen_guest
+ .ascii "GUEST_OS=linux,GUEST_VER=2.6"
+ .ascii ",XEN_VER=xen-3.0"
+ .ascii ",VIRT_BASE=0x"
+ utoh __START_KERNEL_map
+ .ascii ",ELF_PADDR_OFFSET=0x"
+ utoh __START_KERNEL_map
+ .ascii ",VIRT_ENTRY=0x"
+ utoh (__START_KERNEL_map + __PHYSICAL_START)
+ .ascii ",HYPERCALL_PAGE=0x"
+ utoh (phys_hypercall_page >> PAGE_SHIFT)
+ .ascii ",FEATURES=writable_page_tables"
+ .ascii "|writable_descriptor_tables"
+ .ascii "|auto_translated_physmap"
+ .ascii "|supervisor_mode_kernel"
+ .ascii ",LOADER=generic"
+ .byte 0
+#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
+
+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
+ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad __START_KERNEL_map)
+#if CONFIG_XEN_COMPAT <= 0x030002
+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad __START_KERNEL_map)
+#else
+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad 0)
+#endif
+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64)
+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page)
+ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT)
+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
+ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
--- /dev/null
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+#include <linux/mqueue.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+
+static struct fs_struct init_fs = INIT_FS;
+static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
+static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
+#ifdef CONFIG_X86_XEN
+#define swapper_pg_dir ((pgd_t *)NULL)
+#endif
+struct mm_struct init_mm = INIT_MM(init_mm);
+#undef swapper_pg_dir
+EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
+
+/*
+ * Initial thread structure.
+ *
+ * We need to make sure that this is THREAD_SIZE aligned due to the
+ * way process stacks are handled. This is done by having a special
+ * "init_task" linker map entry..
+ */
+union thread_union init_thread_union
+ __attribute__((__section__(".data.init_task"))) =
+ { INIT_THREAD_INFO(init_task) };
+
+/*
+ * Initial task structure.
+ *
+ * All other task structs will be allocated on slabs in fork.c
+ */
+struct task_struct init_task = INIT_TASK(init_task);
+EXPORT_SYMBOL(init_task);
+
+#ifndef CONFIG_X86_NO_TSS
+/*
+ * no more per-task TSS's. The TSS size is kept cacheline-aligned
+ * so they are allowed to end up in the .data.cacheline_aligned
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+#endif
+
--- /dev/null
+/*
+ * Intel IO-APIC support for multi-Pentium hosts.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
+ *
+ * Many thanks to Stig Venaas for trying out countless experimental
+ * patches and reporting/debugging problems patiently!
+ *
+ * (c) 1999, Multiple IO-APIC support, developed by
+ * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
+ * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
+ * further tested and cleaned up by Zach Brown <zab@redhat.com>
+ * and Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
+ * thanks to Eric Gilmore
+ * and Rolf G. Tews
+ * for testing these extensively
+ * Paul Diefenbaugh : Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/mc146818rtc.h>
+#include <linux/compiler.h>
+#include <linux/acpi.h>
+#include <linux/module.h>
+#include <linux/sysdev.h>
+#include <linux/pci.h>
+#include <linux/msi.h>
+#include <linux/htirq.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/jiffies.h> /* time_after() */
+
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
+#include <asm/nmi.h>
+#include <asm/msidef.h>
+#include <asm/hypertransport.h>
+
+#include <mach_apic.h>
+
+#ifdef CONFIG_XEN
+#include <xen/interface/xen.h>
+#include <xen/interface/physdev.h>
+
+/* Fake i8259 */
+#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
+#define disable_8259A_irq(_irq) ((void)0)
+#define i8259A_irq_pending(_irq) (0)
+
+unsigned long io_apic_irqs;
+
+#define clear_IO_APIC() ((void)0)
+#else
+int (*ioapic_renumber_irq)(int ioapic, int irq);
+atomic_t irq_mis_count;
+#endif /* CONFIG_XEN */
+
+/* Where if anywhere is the i8259 connect in external int mode */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+
+static DEFINE_SPINLOCK(ioapic_lock);
+static DEFINE_SPINLOCK(vector_lock);
+
+#ifndef CONFIG_XEN
+int timer_over_8254 __initdata = 1;
+#endif
+
+/*
+ * Is the SiS APIC rmw bug present ?
+ * -1 = don't know, 0 = no, 1 = yes
+ */
+int sis_apic_bug = -1;
+
+/*
+ * # of IRQ routing registers
+ */
+int nr_ioapic_registers[MAX_IO_APICS];
+
+/* I/O APIC entries */
+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+int nr_ioapics;
+
+/* MP IRQ source entries */
+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* # of MP IRQ source entries */
+int mp_irq_entries;
+
+#ifndef CONFIG_XEN
+static int disable_timer_pin_1 __initdata;
+#endif
+
+/*
+ * Rough estimation of how many shared IRQs there are, can
+ * be changed anytime.
+ */
+#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+static struct irq_pin_list {
+ int apic, pin, next;
+} irq_2_pin[PIN_MAP_SIZE];
+
+#ifndef CONFIG_XEN
+struct io_apic {
+ unsigned int index;
+ unsigned int unused[3];
+ unsigned int data;
+};
+
+static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+{
+ return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+ + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+}
+#endif
+
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+#ifndef CONFIG_XEN
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ return readl(&io_apic->data);
+#else
+ struct physdev_apic apic_op;
+ int ret;
+
+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+ apic_op.reg = reg;
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
+ if (ret)
+ return ret;
+ return apic_op.value;
+#endif
+}
+
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+#ifndef CONFIG_XEN
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ writel(value, &io_apic->data);
+#else
+ struct physdev_apic apic_op;
+
+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+ apic_op.reg = reg;
+ apic_op.value = value;
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
+#endif
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Re-write a value: to be used for read-modify-write
+ * cycles where the read already set up the index register.
+ *
+ * Older SiS APIC requires we rewrite the index register
+ */
+static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+{
+ volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
+ if (sis_apic_bug)
+ writel(reg, &io_apic->index);
+ writel(value, &io_apic->data);
+}
+#else
+#define io_apic_modify io_apic_write
+#endif
+
+union entry_union {
+ struct { u32 w1, w2; };
+ struct IO_APIC_route_entry entry;
+};
+
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+ union entry_union eu;
+ unsigned long flags;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ return eu.entry;
+}
+
+/*
+ * When we write a new IO APIC routing entry, we need to write the high
+ * word first! If the mask bit in the low word is clear, we will enable
+ * the interrupt, and we need to make sure the entry is fully populated
+ * before that happens.
+ */
+static void
+__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ union entry_union eu;
+ eu.entry = e;
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __ioapic_write_entry(apic, pin, e);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * When we mask an IO APIC routing entry, we need to write the low
+ * word first, in order to set the mask bit before we change the
+ * high bits!
+ */
+static void ioapic_mask_entry(int apic, int pin)
+{
+ unsigned long flags;
+ union entry_union eu = { .entry.mask = 1 };
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+#endif
+
+/*
+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
+ * shared ISA-space IRQs, so we have to support them. We are super
+ * fast in the common case, and fast for shared ISA-space IRQs.
+ */
+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+{
+ static int first_free_entry = NR_IRQS;
+ struct irq_pin_list *entry = irq_2_pin + irq;
+
+ while (entry->next)
+ entry = irq_2_pin + entry->next;
+
+ if (entry->pin != -1) {
+ entry->next = first_free_entry;
+ entry = irq_2_pin + entry->next;
+ if (++first_free_entry >= PIN_MAP_SIZE)
+ panic("io_apic.c: whoops");
+ }
+ entry->apic = apic;
+ entry->pin = pin;
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Reroute an IRQ to a different pin.
+ */
+static void __init replace_pin_at_irq(unsigned int irq,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry = irq_2_pin + irq;
+
+ while (1) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ }
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+}
+
+static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
+{
+ struct irq_pin_list *entry = irq_2_pin + irq;
+ unsigned int pin, reg;
+
+ for (;;) {
+ pin = entry->pin;
+ if (pin == -1)
+ break;
+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ reg &= ~disable;
+ reg |= enable;
+ io_apic_modify(entry->apic, 0x10 + pin*2, reg);
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+}
+
+/* mask = 1 */
+static void __mask_IO_APIC_irq (unsigned int irq)
+{
+ __modify_IO_APIC_irq(irq, 0x00010000, 0);
+}
+
+/* mask = 0 */
+static void __unmask_IO_APIC_irq (unsigned int irq)
+{
+ __modify_IO_APIC_irq(irq, 0, 0x00010000);
+}
+
+/* mask = 1, trigger = 0 */
+static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
+{
+ __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
+}
+
+/* mask = 0, trigger = 1 */
+static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
+{
+ __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
+}
+
+static void mask_IO_APIC_irq (unsigned int irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __mask_IO_APIC_irq(irq);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void unmask_IO_APIC_irq (unsigned int irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __unmask_IO_APIC_irq(irq);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+{
+ struct IO_APIC_route_entry entry;
+
+ /* Check delivery_mode to be sure we're not clearing an SMI pin */
+ entry = ioapic_read_entry(apic, pin);
+ if (entry.delivery_mode == dest_SMI)
+ return;
+
+ /*
+ * Disable it in the IO-APIC irq-routing table:
+ */
+ ioapic_mask_entry(apic, pin);
+}
+
+static void clear_IO_APIC (void)
+{
+ int apic, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+ clear_IO_APIC_pin(apic, pin);
+}
+
+#ifdef CONFIG_SMP
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
+{
+ unsigned long flags;
+ int pin;
+ struct irq_pin_list *entry = irq_2_pin + irq;
+ unsigned int apicid_value;
+ cpumask_t tmp;
+
+ cpus_and(tmp, cpumask, cpu_online_map);
+ if (cpus_empty(tmp))
+ tmp = TARGET_CPUS;
+
+ cpus_and(cpumask, tmp, CPU_MASK_ALL);
+
+ apicid_value = cpu_mask_to_apicid(cpumask);
+ /* Prepare to do the io_apic_write */
+ apicid_value = apicid_value << 24;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ for (;;) {
+ pin = entry->pin;
+ if (pin == -1)
+ break;
+ io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+ irq_desc[irq].affinity = cpumask;
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#if defined(CONFIG_IRQBALANCE)
+# include <asm/processor.h> /* kernel_thread() */
+# include <linux/kernel_stat.h> /* kstat */
+# include <linux/slab.h> /* kmalloc() */
+# include <linux/timer.h>
+
+#define IRQBALANCE_CHECK_ARCH -999
+#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
+#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
+#define BALANCED_IRQ_MORE_DELTA (HZ/10)
+#define BALANCED_IRQ_LESS_DELTA (HZ)
+
+static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
+static int physical_balance __read_mostly;
+static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
+
+static struct irq_cpu_info {
+ unsigned long * last_irq;
+ unsigned long * irq_delta;
+ unsigned long irq;
+} irq_cpu_data[NR_CPUS];
+
+#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
+#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
+#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
+
+#define IDLE_ENOUGH(cpu,now) \
+ (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
+
+#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
+
+#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i)))
+
+static cpumask_t balance_irq_affinity[NR_IRQS] = {
+ [0 ... NR_IRQS-1] = CPU_MASK_ALL
+};
+
+void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+ balance_irq_affinity[irq] = mask;
+}
+
+static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
+ unsigned long now, int direction)
+{
+ int search_idle = 1;
+ int cpu = curr_cpu;
+
+ goto inside;
+
+ do {
+ if (unlikely(cpu == curr_cpu))
+ search_idle = 0;
+inside:
+ if (direction == 1) {
+ cpu++;
+ if (cpu >= NR_CPUS)
+ cpu = 0;
+ } else {
+ cpu--;
+ if (cpu == -1)
+ cpu = NR_CPUS-1;
+ }
+ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
+ (search_idle && !IDLE_ENOUGH(cpu,now)));
+
+ return cpu;
+}
+
+static inline void balance_irq(int cpu, int irq)
+{
+ unsigned long now = jiffies;
+ cpumask_t allowed_mask;
+ unsigned int new_cpu;
+
+ if (irqbalance_disabled)
+ return;
+
+ cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
+ new_cpu = move(cpu, allowed_mask, now, 1);
+ if (cpu != new_cpu) {
+ set_pending_irq(irq, cpumask_of_cpu(new_cpu));
+ }
+}
+
+static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
+{
+ int i, j;
+
+ for_each_online_cpu(i) {
+ for (j = 0; j < NR_IRQS; j++) {
+ if (!irq_desc[j].action)
+ continue;
+ /* Is it a significant load ? */
+ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
+ useful_load_threshold)
+ continue;
+ balance_irq(i, j);
+ }
+ }
+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ return;
+}
+
+static void do_irq_balance(void)
+{
+ int i, j;
+ unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
+ unsigned long move_this_load = 0;
+ int max_loaded = 0, min_loaded = 0;
+ int load;
+ unsigned long useful_load_threshold = balanced_irq_interval + 10;
+ int selected_irq;
+ int tmp_loaded, first_attempt = 1;
+ unsigned long tmp_cpu_irq;
+ unsigned long imbalance = 0;
+ cpumask_t allowed_mask, target_cpu_mask, tmp;
+
+ for_each_possible_cpu(i) {
+ int package_index;
+ CPU_IRQ(i) = 0;
+ if (!cpu_online(i))
+ continue;
+ package_index = CPU_TO_PACKAGEINDEX(i);
+ for (j = 0; j < NR_IRQS; j++) {
+ unsigned long value_now, delta;
+ /* Is this an active IRQ or balancing disabled ? */
+ if (!irq_desc[j].action || irq_balancing_disabled(j))
+ continue;
+ if ( package_index == i )
+ IRQ_DELTA(package_index,j) = 0;
+ /* Determine the total count per processor per IRQ */
+ value_now = (unsigned long) kstat_cpu(i).irqs[j];
+
+ /* Determine the activity per processor per IRQ */
+ delta = value_now - LAST_CPU_IRQ(i,j);
+
+ /* Update last_cpu_irq[][] for the next time */
+ LAST_CPU_IRQ(i,j) = value_now;
+
+ /* Ignore IRQs whose rate is less than the clock */
+ if (delta < useful_load_threshold)
+ continue;
+ /* update the load for the processor or package total */
+ IRQ_DELTA(package_index,j) += delta;
+
+ /* Keep track of the higher numbered sibling as well */
+ if (i != package_index)
+ CPU_IRQ(i) += delta;
+ /*
+ * We have sibling A and sibling B in the package
+ *
+ * cpu_irq[A] = load for cpu A + load for cpu B
+ * cpu_irq[B] = load for cpu B
+ */
+ CPU_IRQ(package_index) += delta;
+ }
+ }
+ /* Find the least loaded processor package */
+ for_each_online_cpu(i) {
+ if (i != CPU_TO_PACKAGEINDEX(i))
+ continue;
+ if (min_cpu_irq > CPU_IRQ(i)) {
+ min_cpu_irq = CPU_IRQ(i);
+ min_loaded = i;
+ }
+ }
+ max_cpu_irq = ULONG_MAX;
+
+tryanothercpu:
+ /* Look for heaviest loaded processor.
+ * We may come back to get the next heaviest loaded processor.
+ * Skip processors with trivial loads.
+ */
+ tmp_cpu_irq = 0;
+ tmp_loaded = -1;
+ for_each_online_cpu(i) {
+ if (i != CPU_TO_PACKAGEINDEX(i))
+ continue;
+ if (max_cpu_irq <= CPU_IRQ(i))
+ continue;
+ if (tmp_cpu_irq < CPU_IRQ(i)) {
+ tmp_cpu_irq = CPU_IRQ(i);
+ tmp_loaded = i;
+ }
+ }
+
+ if (tmp_loaded == -1) {
+ /* In the case of small number of heavy interrupt sources,
+ * loading some of the cpus too much. We use Ingo's original
+ * approach to rotate them around.
+ */
+ if (!first_attempt && imbalance >= useful_load_threshold) {
+ rotate_irqs_among_cpus(useful_load_threshold);
+ return;
+ }
+ goto not_worth_the_effort;
+ }
+
+ first_attempt = 0; /* heaviest search */
+ max_cpu_irq = tmp_cpu_irq; /* load */
+ max_loaded = tmp_loaded; /* processor */
+ imbalance = (max_cpu_irq - min_cpu_irq) / 2;
+
+ /* if imbalance is less than approx 10% of max load, then
+ * observe diminishing returns action. - quit
+ */
+ if (imbalance < (max_cpu_irq >> 3))
+ goto not_worth_the_effort;
+
+tryanotherirq:
+ /* if we select an IRQ to move that can't go where we want, then
+ * see if there is another one to try.
+ */
+ move_this_load = 0;
+ selected_irq = -1;
+ for (j = 0; j < NR_IRQS; j++) {
+ /* Is this an active IRQ? */
+ if (!irq_desc[j].action)
+ continue;
+ if (imbalance <= IRQ_DELTA(max_loaded,j))
+ continue;
+ /* Try to find the IRQ that is closest to the imbalance
+ * without going over.
+ */
+ if (move_this_load < IRQ_DELTA(max_loaded,j)) {
+ move_this_load = IRQ_DELTA(max_loaded,j);
+ selected_irq = j;
+ }
+ }
+ if (selected_irq == -1) {
+ goto tryanothercpu;
+ }
+
+ imbalance = move_this_load;
+
+ /* For physical_balance case, we accumulated both load
+ * values in the one of the siblings cpu_irq[],
+ * to use the same code for physical and logical processors
+ * as much as possible.
+ *
+ * NOTE: the cpu_irq[] array holds the sum of the load for
+ * sibling A and sibling B in the slot for the lowest numbered
+ * sibling (A), _AND_ the load for sibling B in the slot for
+ * the higher numbered sibling.
+ *
+ * We seek the least loaded sibling by making the comparison
+ * (A+B)/2 vs B
+ */
+ load = CPU_IRQ(min_loaded) >> 1;
+ for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) {
+ if (load > CPU_IRQ(j)) {
+ /* This won't change cpu_sibling_map[min_loaded] */
+ load = CPU_IRQ(j);
+ min_loaded = j;
+ }
+ }
+
+ cpus_and(allowed_mask,
+ cpu_online_map,
+ balance_irq_affinity[selected_irq]);
+ target_cpu_mask = cpumask_of_cpu(min_loaded);
+ cpus_and(tmp, target_cpu_mask, allowed_mask);
+
+ if (!cpus_empty(tmp)) {
+ /* mark for change destination */
+ set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
+
+ /* Since we made a change, come back sooner to
+ * check for more variation.
+ */
+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ return;
+ }
+ goto tryanotherirq;
+
+not_worth_the_effort:
+ /*
+ * if we did not find an IRQ to move, then adjust the time interval
+ * upward
+ */
+ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
+ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
+ return;
+}
+
+static int balanced_irq(void *unused)
+{
+ int i;
+ unsigned long prev_balance_time = jiffies;
+ long time_remaining = balanced_irq_interval;
+
+ /* push everything to CPU 0 to give us a starting point. */
+ for (i = 0 ; i < NR_IRQS ; i++) {
+ irq_desc[i].pending_mask = cpumask_of_cpu(0);
+ set_pending_irq(i, cpumask_of_cpu(0));
+ }
+
+ set_freezable();
+ for ( ; ; ) {
+ time_remaining = schedule_timeout_interruptible(time_remaining);
+ try_to_freeze();
+ if (time_after(jiffies,
+ prev_balance_time+balanced_irq_interval)) {
+ preempt_disable();
+ do_irq_balance();
+ prev_balance_time = jiffies;
+ time_remaining = balanced_irq_interval;
+ preempt_enable();
+ }
+ }
+ return 0;
+}
+
+static int __init balanced_irq_init(void)
+{
+ int i;
+ struct cpuinfo_x86 *c;
+ cpumask_t tmp;
+
+ cpus_shift_right(tmp, cpu_online_map, 2);
+ c = &boot_cpu_data;
+ /* When not overwritten by the command line ask subarchitecture. */
+ if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
+ irqbalance_disabled = NO_BALANCE_IRQ;
+ if (irqbalance_disabled)
+ return 0;
+
+ /* disable irqbalance completely if there is only one processor online */
+ if (num_online_cpus() < 2) {
+ irqbalance_disabled = 1;
+ return 0;
+ }
+ /*
+ * Enable physical balance only if more than 1 physical processor
+ * is present
+ */
+ if (smp_num_siblings > 1 && !cpus_empty(tmp))
+ physical_balance = 1;
+
+ for_each_online_cpu(i) {
+ irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
+ printk(KERN_ERR "balanced_irq_init: out of memory");
+ goto failed;
+ }
+ memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
+ memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
+ }
+
+ printk(KERN_INFO "Starting balanced_irq\n");
+ if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
+ return 0;
+ printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
+failed:
+ for_each_possible_cpu(i) {
+ kfree(irq_cpu_data[i].irq_delta);
+ irq_cpu_data[i].irq_delta = NULL;
+ kfree(irq_cpu_data[i].last_irq);
+ irq_cpu_data[i].last_irq = NULL;
+ }
+ return 0;
+}
+
+int __devinit irqbalance_disable(char *str)
+{
+ irqbalance_disabled = 1;
+ return 1;
+}
+
+__setup("noirqbalance", irqbalance_disable);
+
+late_initcall(balanced_irq_init);
+#endif /* CONFIG_IRQBALANCE */
+#endif /* CONFIG_SMP */
+#endif
+
+#ifndef CONFIG_SMP
+void send_IPI_self(int vector)
+{
+#ifndef CONFIG_XEN
+ unsigned int cfg;
+
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+ cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ apic_write_around(APIC_ICR, cfg);
+#endif
+}
+#endif /* !CONFIG_SMP */
+
+
+/*
+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+static int pirq_entries [MAX_PIRQS];
+static int pirqs_enabled;
+int skip_ioapic_setup;
+
+static int __init ioapic_pirq_setup(char *str)
+{
+ int i, max;
+ int ints[MAX_PIRQS+1];
+
+ get_options(str, ARRAY_SIZE(ints), ints);
+
+ for (i = 0; i < MAX_PIRQS; i++)
+ pirq_entries[i] = -1;
+
+ pirqs_enabled = 1;
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "PIRQ redirection, working around broken MP-BIOS.\n");
+ max = MAX_PIRQS;
+ if (ints[0] < MAX_PIRQS)
+ max = ints[0];
+
+ for (i = 0; i < max; i++) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
+ /*
+ * PIRQs are mapped upside down, usually.
+ */
+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
+ }
+ return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+
+/*
+ * Find the IRQ entry number of a certain pin.
+ */
+static int find_irq_entry(int apic, int pin, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].mpc_irqtype == type &&
+ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].mpc_dstirq == pin)
+ return i;
+
+ return -1;
+}
+
+/*
+ * Find the pin to which IRQ[irq] (ISA) is connected
+ */
+static int __init find_isa_irq_pin(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].mpc_srcbus;
+
+ if (test_bit(lbus, mp_bus_not_pci) &&
+ (mp_irqs[i].mpc_irqtype == type) &&
+ (mp_irqs[i].mpc_srcbusirq == irq))
+
+ return mp_irqs[i].mpc_dstirq;
+ }
+ return -1;
+}
+
+static int __init find_isa_irq_apic(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].mpc_srcbus;
+
+ if (test_bit(lbus, mp_bus_not_pci) &&
+ (mp_irqs[i].mpc_irqtype == type) &&
+ (mp_irqs[i].mpc_srcbusirq == irq))
+ break;
+ }
+ if (i < mp_irq_entries) {
+ int apic;
+ for(apic = 0; apic < nr_ioapics; apic++) {
+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+ return apic;
+ }
+ }
+
+ return -1;
+}
+
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+static int pin_2_irq(int idx, int apic, int pin);
+
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
+{
+ int apic, i, best_guess = -1;
+
+ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
+ "slot:%d, pin:%d.\n", bus, slot, pin);
+ if (mp_bus_id_to_pci_bus[bus] == -1) {
+ printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ return -1;
+ }
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].mpc_srcbus;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ break;
+
+ if (!test_bit(lbus, mp_bus_not_pci) &&
+ !mp_irqs[i].mpc_irqtype &&
+ (bus == lbus) &&
+ (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+
+ if (!(apic || IO_APIC_IRQ(irq)))
+ continue;
+
+ if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+ return irq;
+ /*
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
+ */
+ if (best_guess < 0)
+ best_guess = irq;
+ }
+ }
+ return best_guess;
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
+/*
+ * This function currently is only a helper for the i386 smp boot process where
+ * we need to reprogram the ioredtbls to cater for the cpus which have come online
+ * so mask in all cases should simply be TARGET_CPUS
+ */
+#ifdef CONFIG_SMP
+#ifndef CONFIG_XEN
+void __init setup_ioapic_dest(void)
+{
+ int pin, ioapic, irq, irq_entry;
+
+ if (skip_ioapic_setup == 1)
+ return;
+
+ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
+ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+ irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+ if (irq_entry == -1)
+ continue;
+ irq = pin_2_irq(irq_entry, ioapic, pin);
+ set_ioapic_affinity_irq(irq, TARGET_CPUS);
+ }
+
+ }
+}
+#endif /* !CONFIG_XEN */
+#endif
+
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static int EISA_ELCR(unsigned int irq)
+{
+ if (irq < 16) {
+ unsigned int port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "Broken MPtable reports ISA irq %d\n", irq);
+ return 0;
+}
+#endif
+
+/* ISA interrupts are always polarity zero edge triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_ISA_trigger(idx) (0)
+#define default_ISA_polarity(idx) (0)
+
+/* EISA interrupts are always polarity zero and can be edge or level
+ * trigger depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must
+ * be read in from the ELCR */
+
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
+#define default_EISA_polarity(idx) default_ISA_polarity(idx)
+
+/* PCI interrupts are always polarity one level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_PCI_trigger(idx) (1)
+#define default_PCI_polarity(idx) (1)
+
+/* MCA interrupts are always polarity zero level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_MCA_trigger(idx) (1)
+#define default_MCA_polarity(idx) default_ISA_polarity(idx)
+
+static int MPBIOS_polarity(int idx)
+{
+ int bus = mp_irqs[idx].mpc_srcbus;
+ int polarity;
+
+ /*
+ * Determine IRQ line polarity (high active or low active):
+ */
+ switch (mp_irqs[idx].mpc_irqflag & 3)
+ {
+ case 0: /* conforms, ie. bus-type dependent polarity */
+ {
+ polarity = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_polarity(idx):
+ default_PCI_polarity(idx);
+ break;
+ }
+ case 1: /* high active */
+ {
+ polarity = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ case 3: /* low active */
+ {
+ polarity = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ }
+ return polarity;
+}
+
+static int MPBIOS_trigger(int idx)
+{
+ int bus = mp_irqs[idx].mpc_srcbus;
+ int trigger;
+
+ /*
+ * Determine IRQ trigger mode (edge or level sensitive):
+ */
+ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+ {
+ case 0: /* conforms, ie. bus-type dependent */
+ {
+ trigger = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_trigger(idx):
+ default_PCI_trigger(idx);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+ switch (mp_bus_id_to_type[bus])
+ {
+ case MP_BUS_ISA: /* ISA pin */
+ {
+ /* set before the switch */
+ break;
+ }
+ case MP_BUS_EISA: /* EISA pin */
+ {
+ trigger = default_EISA_trigger(idx);
+ break;
+ }
+ case MP_BUS_PCI: /* PCI pin */
+ {
+ /* set before the switch */
+ break;
+ }
+ case MP_BUS_MCA: /* MCA pin */
+ {
+ trigger = default_MCA_trigger(idx);
+ break;
+ }
+ default:
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ }
+#endif
+ break;
+ }
+ case 1: /* edge */
+ {
+ trigger = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ case 3: /* level */
+ {
+ trigger = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 0;
+ break;
+ }
+ }
+ return trigger;
+}
+
+static inline int irq_polarity(int idx)
+{
+ return MPBIOS_polarity(idx);
+}
+
+static inline int irq_trigger(int idx)
+{
+ return MPBIOS_trigger(idx);
+}
+
+static int pin_2_irq(int idx, int apic, int pin)
+{
+ int irq, i;
+ int bus = mp_irqs[idx].mpc_srcbus;
+
+ /*
+ * Debugging check, we are in big trouble if this message pops up!
+ */
+ if (mp_irqs[idx].mpc_dstirq != pin)
+ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
+
+ if (test_bit(bus, mp_bus_not_pci))
+ irq = mp_irqs[idx].mpc_srcbusirq;
+ else {
+ /*
+ * PCI IRQs are mapped in order
+ */
+ i = irq = 0;
+ while (i < apic)
+ irq += nr_ioapic_registers[i++];
+ irq += pin;
+
+#ifndef CONFIG_XEN
+ /*
+ * For MPS mode, so far only needed by ES7000 platform
+ */
+ if (ioapic_renumber_irq)
+ irq = ioapic_renumber_irq(apic, irq);
+#endif
+ }
+
+ /*
+ * PCI IRQ command line redirection. Yes, limits are hardcoded.
+ */
+ if ((pin >= 16) && (pin <= 23)) {
+ if (pirq_entries[pin-16] != -1) {
+ if (!pirq_entries[pin-16]) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "disabling PIRQ%d\n", pin-16);
+ } else {
+ irq = pirq_entries[pin-16];
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "using PIRQ%d -> IRQ %d\n",
+ pin-16, irq);
+ }
+ }
+ }
+ return irq;
+}
+
+static inline int IO_APIC_irq_trigger(int irq)
+{
+ int apic, idx, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ idx = find_irq_entry(apic,pin,mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
+ return irq_trigger(idx);
+ }
+ }
+ /*
+ * nonexistent IRQs are edge default
+ */
+ return 0;
+}
+
+/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
+static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
+
+static int __assign_irq_vector(int irq)
+{
+ int vector;
+ struct physdev_irq irq_op;
+
+ BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
+
+ if (irq_vector[irq] > 0)
+ return irq_vector[irq];
+
+ irq_op.irq = irq;
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
+ return -ENOSPC;
+
+ vector = irq_op.vector;
+ irq_vector[irq] = vector;
+
+ return vector;
+}
+
+static int assign_irq_vector(int irq)
+{
+ unsigned long flags;
+ int vector;
+
+ spin_lock_irqsave(&vector_lock, flags);
+ vector = __assign_irq_vector(irq);
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+ return vector;
+}
+
+#ifndef CONFIG_XEN
+static struct irq_chip ioapic_chip;
+
+#define IOAPIC_AUTO -1
+#define IOAPIC_EDGE 0
+#define IOAPIC_LEVEL 1
+
+static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
+{
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL) {
+ irq_desc[irq].status |= IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &ioapic_chip,
+ handle_fasteoi_irq, "fasteoi");
+ } else {
+ irq_desc[irq].status &= ~IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &ioapic_chip,
+ handle_edge_irq, "edge");
+ }
+ set_intr_gate(vector, interrupt[irq]);
+}
+#else
+#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
+#endif
+
+static void __init setup_IO_APIC_irqs(void)
+{
+ struct IO_APIC_route_entry entry;
+ int apic, pin, idx, irq, first_notcon = 1, vector;
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+
+ /*
+ * add it to the IO-APIC irq-routing table:
+ */
+ memset(&entry,0,sizeof(entry));
+
+ entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.dest_mode = INT_DEST_MODE;
+ entry.mask = 0; /* enable IRQ */
+ entry.dest.logical.logical_dest =
+ cpu_mask_to_apicid(TARGET_CPUS);
+
+ idx = find_irq_entry(apic,pin,mp_INT);
+ if (idx == -1) {
+ if (first_notcon) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ " IO-APIC (apicid-pin) %d-%d",
+ mp_ioapics[apic].mpc_apicid,
+ pin);
+ first_notcon = 0;
+ } else
+ apic_printk(APIC_VERBOSE, ", %d-%d",
+ mp_ioapics[apic].mpc_apicid, pin);
+ continue;
+ }
+
+ if (!first_notcon) {
+ apic_printk(APIC_VERBOSE, " not connected.\n");
+ first_notcon = 1;
+ }
+
+ entry.trigger = irq_trigger(idx);
+ entry.polarity = irq_polarity(idx);
+
+ if (irq_trigger(idx)) {
+ entry.trigger = 1;
+ entry.mask = 1;
+ }
+
+ irq = pin_2_irq(idx, apic, pin);
+ /*
+ * skip adding the timer int on secondary nodes, which causes
+ * a small but painful rift in the time-space continuum
+ */
+ if (multi_timer_check(apic, irq))
+ continue;
+ else
+ add_pin_to_irq(irq, apic, pin);
+
+ if (/*!apic &&*/ !IO_APIC_IRQ(irq))
+ continue;
+
+ if (IO_APIC_IRQ(irq)) {
+ vector = assign_irq_vector(irq);
+ entry.vector = vector;
+ ioapic_register_intr(irq, vector, IOAPIC_AUTO);
+
+ if (!apic && (irq < 16))
+ disable_8259A_irq(irq);
+ }
+ ioapic_write_entry(apic, pin, entry);
+ }
+ }
+
+ if (!first_notcon)
+ apic_printk(APIC_VERBOSE, " not connected.\n");
+}
+
+/*
+ * Set up the 8259A-master output pin:
+ */
+#ifndef CONFIG_XEN
+static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+{
+ struct IO_APIC_route_entry entry;
+
+ memset(&entry,0,sizeof(entry));
+
+ disable_8259A_irq(0);
+
+ /* mask LVT0 */
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+
+ /*
+ * We use logical delivery to get the timer IRQ
+ * to the first CPU.
+ */
+ entry.dest_mode = INT_DEST_MODE;
+ entry.mask = 0; /* unmask IRQ now */
+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+ entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.polarity = 0;
+ entry.trigger = 0;
+ entry.vector = vector;
+
+ /*
+ * The timer IRQ doesn't have to know that behind the
+ * scene we have a 8259A-master in AEOI mode ...
+ */
+ irq_desc[0].chip = &ioapic_chip;
+ set_irq_handler(0, handle_edge_irq);
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ ioapic_write_entry(apic, pin, entry);
+
+ enable_8259A_irq(0);
+}
+
+void __init print_IO_APIC(void)
+{
+ int apic, i;
+ union IO_APIC_reg_00 reg_00;
+ union IO_APIC_reg_01 reg_01;
+ union IO_APIC_reg_02 reg_02;
+ union IO_APIC_reg_03 reg_03;
+ unsigned long flags;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+ for (i = 0; i < nr_ioapics; i++)
+ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
+ mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+
+ /*
+ * We are a bit conservative about what we expect. We have to
+ * know about every hardware change ASAP.
+ */
+ printk(KERN_INFO "testing the IO APIC.......................\n");
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ reg_01.raw = io_apic_read(apic, 1);
+ if (reg_01.bits.version >= 0x10)
+ reg_02.raw = io_apic_read(apic, 2);
+ if (reg_01.bits.version >= 0x20)
+ reg_03.raw = io_apic_read(apic, 3);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
+ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
+ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
+ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
+
+ printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
+ printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
+
+ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
+ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
+
+ /*
+ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
+ * but the value of reg_02 is read as the previous read register
+ * value, so ignore it if reg_02 == reg_01.
+ */
+ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
+ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
+ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
+ }
+
+ /*
+ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
+ * or reg_03, but the value of reg_0[23] is read as the previous read
+ * register value, so ignore it if reg_03 == reg_0[12].
+ */
+ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
+ reg_03.raw != reg_01.raw) {
+ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
+ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
+ }
+
+ printk(KERN_DEBUG ".... IRQ redirection table:\n");
+
+ printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
+ " Stat Dest Deli Vect: \n");
+
+ for (i = 0; i <= reg_01.bits.entries; i++) {
+ struct IO_APIC_route_entry entry;
+
+ entry = ioapic_read_entry(apic, i);
+
+ printk(KERN_DEBUG " %02x %03X %02X ",
+ i,
+ entry.dest.logical.logical_dest,
+ entry.dest.physical.physical_dest
+ );
+
+ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
+ entry.mask,
+ entry.trigger,
+ entry.irr,
+ entry.polarity,
+ entry.delivery_status,
+ entry.dest_mode,
+ entry.delivery_mode,
+ entry.vector
+ );
+ }
+ }
+ printk(KERN_DEBUG "IRQ to pin mappings:\n");
+ for (i = 0; i < NR_IRQS; i++) {
+ struct irq_pin_list *entry = irq_2_pin + i;
+ if (entry->pin < 0)
+ continue;
+ printk(KERN_DEBUG "IRQ%d ", i);
+ for (;;) {
+ printk("-> %d:%d", entry->apic, entry->pin);
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+ printk("\n");
+ }
+
+ printk(KERN_INFO ".................................... done.\n");
+
+ return;
+}
+
+static void print_APIC_bitfield (int base)
+{
+ unsigned int v;
+ int i, j;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
+ for (i = 0; i < 8; i++) {
+ v = apic_read(base + i*0x10);
+ for (j = 0; j < 32; j++) {
+ if (v & (1<<j))
+ printk("1");
+ else
+ printk("0");
+ }
+ printk("\n");
+ }
+}
+
+void /*__init*/ print_local_APIC(void * dummy)
+{
+ unsigned int v, ver, maxlvt;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
+ smp_processor_id(), hard_smp_processor_id());
+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
+ GET_APIC_ID(read_apic_id()));
+ v = apic_read(APIC_LVR);
+ printk(KERN_INFO "... APIC VERSION: %08x\n", v);
+ ver = GET_APIC_VERSION(v);
+ maxlvt = lapic_get_maxlvt();
+
+ v = apic_read(APIC_TASKPRI);
+ printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ v = apic_read(APIC_ARBPRI);
+ printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+ v & APIC_ARBPRI_MASK);
+ v = apic_read(APIC_PROCPRI);
+ printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+ }
+
+ v = apic_read(APIC_EOI);
+ printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
+ v = apic_read(APIC_RRR);
+ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+ v = apic_read(APIC_LDR);
+ printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
+ v = apic_read(APIC_DFR);
+ printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+ v = apic_read(APIC_SPIV);
+ printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
+
+ printk(KERN_DEBUG "... APIC ISR field:\n");
+ print_APIC_bitfield(APIC_ISR);
+ printk(KERN_DEBUG "... APIC TMR field:\n");
+ print_APIC_bitfield(APIC_TMR);
+ printk(KERN_DEBUG "... APIC IRR field:\n");
+ print_APIC_bitfield(APIC_IRR);
+
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ v = apic_read(APIC_ESR);
+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_ICR);
+ printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
+ v = apic_read(APIC_ICR2);
+ printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
+
+ v = apic_read(APIC_LVTT);
+ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
+
+ if (maxlvt > 3) { /* PC is LVT#4. */
+ v = apic_read(APIC_LVTPC);
+ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
+ }
+ v = apic_read(APIC_LVT0);
+ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
+ v = apic_read(APIC_LVT1);
+ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
+
+ if (maxlvt > 2) { /* ERR is LVT#3. */
+ v = apic_read(APIC_LVTERR);
+ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_TMICT);
+ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
+ v = apic_read(APIC_TMCCT);
+ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
+ v = apic_read(APIC_TDCR);
+ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+ printk("\n");
+}
+
+void print_all_local_APICs (void)
+{
+ on_each_cpu(print_local_APIC, NULL, 1, 1);
+}
+
+void /*__init*/ print_PIC(void)
+{
+ unsigned int v;
+ unsigned long flags;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk(KERN_DEBUG "\nprinting PIC contents\n");
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+
+ v = inb(0xa1) << 8 | inb(0x21);
+ printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
+
+ v = inb(0xa0) << 8 | inb(0x20);
+ printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
+
+ outb(0x0b,0xa0);
+ outb(0x0b,0x20);
+ v = inb(0xa0) << 8 | inb(0x20);
+ outb(0x0a,0xa0);
+ outb(0x0a,0x20);
+
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
+
+ v = inb(0x4d1) << 8 | inb(0x4d0);
+ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
+}
+#endif /* !CONFIG_XEN */
+
+static void __init enable_IO_APIC(void)
+{
+ union IO_APIC_reg_01 reg_01;
+ int i8259_apic, i8259_pin;
+ int i, apic;
+ unsigned long flags;
+
+ for (i = 0; i < PIN_MAP_SIZE; i++) {
+ irq_2_pin[i].pin = -1;
+ irq_2_pin[i].next = 0;
+ }
+ if (!pirqs_enabled)
+ for (i = 0; i < MAX_PIRQS; i++)
+ pirq_entries[i] = -1;
+
+ /*
+ * The number of IO-APIC IRQ registers (== #pins):
+ */
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(apic, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ nr_ioapic_registers[apic] = reg_01.bits.entries+1;
+ }
+ for(apic = 0; apic < nr_ioapics; apic++) {
+ int pin;
+ /* See if any of the pins is in ExtINT mode */
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ struct IO_APIC_route_entry entry;
+ entry = ioapic_read_entry(apic, pin);
+
+
+ /* If the interrupt line is enabled and in ExtInt mode
+ * I have found the pin where the i8259 is connected.
+ */
+ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+ ioapic_i8259.apic = apic;
+ ioapic_i8259.pin = pin;
+ goto found_i8259;
+ }
+ }
+ }
+ found_i8259:
+ /* Look to see what if the MP table has reported the ExtINT */
+ /* If we could not find the appropriate pin by looking at the ioapic
+ * the i8259 probably is not connected the ioapic but give the
+ * mptable a chance anyway.
+ */
+ i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
+ i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
+ /* Trust the MP table if nothing is setup in the hardware */
+ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
+ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+ ioapic_i8259.pin = i8259_pin;
+ ioapic_i8259.apic = i8259_apic;
+ }
+ /* Complain if the MP table and the hardware disagree */
+ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+ {
+ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
+ }
+
+ /*
+ * Do not trust the IO-APIC being empty at bootup
+ */
+ clear_IO_APIC();
+}
+
+/*
+ * Not an __init, needed by the reboot code
+ */
+void disable_IO_APIC(void)
+{
+ /*
+ * Clear the IO-APIC before rebooting:
+ */
+ clear_IO_APIC();
+
+#ifndef CONFIG_XEN
+ /*
+ * If the i8259 is routed through an IOAPIC
+ * Put that IOAPIC in virtual wire mode
+ * so legacy interrupts can be delivered.
+ */
+ if (ioapic_i8259.pin != -1) {
+ struct IO_APIC_route_entry entry;
+
+ memset(&entry, 0, sizeof(entry));
+ entry.mask = 0; /* Enabled */
+ entry.trigger = 0; /* Edge */
+ entry.irr = 0;
+ entry.polarity = 0; /* High */
+ entry.delivery_status = 0;
+ entry.dest_mode = 0; /* Physical */
+ entry.delivery_mode = dest_ExtINT; /* ExtInt */
+ entry.vector = 0;
+ entry.dest.physical.physical_dest =
+ GET_APIC_ID(read_apic_id());
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
+ }
+ disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+#endif
+}
+
+/*
+ * function to set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ *
+ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
+ */
+
+#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
+static void __init setup_ioapic_ids_from_mpc(void)
+{
+ union IO_APIC_reg_00 reg_00;
+ physid_mask_t phys_id_present_map;
+ int apic;
+ int i;
+ unsigned char old_id;
+ unsigned long flags;
+
+ /*
+ * Don't check I/O APIC IDs for xAPIC systems. They have
+ * no meaning without the serial APIC bus.
+ */
+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return;
+ /*
+ * This is broken; anything with a real cpu count has to
+ * circumvent this idiocy regardless.
+ */
+ phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+ /*
+ * Set the IOAPIC ID to the value stored in the MPC table.
+ */
+ for (apic = 0; apic < nr_ioapics; apic++) {
+
+ /* Read the register 0 value */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ old_id = mp_ioapics[apic].mpc_apicid;
+
+ if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
+ apic, mp_ioapics[apic].mpc_apicid);
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ reg_00.bits.ID);
+ mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
+ }
+
+ /*
+ * Sanity check, is the ID really free? Every APIC in a
+ * system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (check_apicid_used(phys_id_present_map,
+ mp_ioapics[apic].mpc_apicid)) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
+ apic, mp_ioapics[apic].mpc_apicid);
+ for (i = 0; i < get_physical_broadcast(); i++)
+ if (!physid_isset(i, phys_id_present_map))
+ break;
+ if (i >= get_physical_broadcast())
+ panic("Max APIC ID exceeded!\n");
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ i);
+ physid_set(i, phys_id_present_map);
+ mp_ioapics[apic].mpc_apicid = i;
+ } else {
+ physid_mask_t tmp;
+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
+ apic_printk(APIC_VERBOSE, "Setting %d in the "
+ "phys_id_present_map\n",
+ mp_ioapics[apic].mpc_apicid);
+ physids_or(phys_id_present_map, phys_id_present_map, tmp);
+ }
+
+
+ /*
+ * We need to adjust the IRQ routing table
+ * if the ID changed.
+ */
+ if (old_id != mp_ioapics[apic].mpc_apicid)
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].mpc_dstapic == old_id)
+ mp_irqs[i].mpc_dstapic
+ = mp_ioapics[apic].mpc_apicid;
+
+ /*
+ * Read the right value from the MPC table and
+ * write it into the ID register.
+ */
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "...changing IO-APIC physical APIC ID to %d ...",
+ mp_ioapics[apic].mpc_apicid);
+
+ reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0, reg_00.raw);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /*
+ * Sanity check
+ */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
+ printk("could not set ID!\n");
+ else
+ apic_printk(APIC_VERBOSE, " ok.\n");
+ }
+}
+#else
+static void __init setup_ioapic_ids_from_mpc(void) { }
+#endif
+
+#ifndef CONFIG_XEN
+int no_timer_check __initdata;
+
+static int __init notimercheck(char *s)
+{
+ no_timer_check = 1;
+ return 1;
+}
+__setup("no_timer_check", notimercheck);
+
+/*
+ * There is a nasty bug in some older SMP boards, their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ * - timer IRQ defaults to IO-APIC IRQ
+ * - if this function detects that timer IRQs are defunct, then we fall
+ * back to ISA timer IRQs
+ */
+static int __init timer_irq_works(void)
+{
+ unsigned long t1 = jiffies;
+ unsigned long flags;
+
+ if (no_timer_check)
+ return 1;
+
+ local_save_flags(flags);
+ local_irq_enable();
+ /* Let ten ticks pass... */
+ mdelay((10 * 1000) / HZ);
+ local_irq_restore(flags);
+
+ /*
+ * Expect a few ticks at least, to be sure some possible
+ * glue logic does not lock up after one or two first
+ * ticks in a non-ExtINT mode. Also the local APIC
+ * might have cached one ExtINT interrupt. Finally, at
+ * least one tick may be lost due to delays.
+ */
+ if (time_after(jiffies, t1 + 4))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * In the SMP+IOAPIC case it might happen that there are an unspecified
+ * number of pending IRQ events unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
+ * better to do it this way as thus we do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge triggered needs to resend any interrupt
+ * that was delayed but this is now handled in the device
+ * independent code.
+ */
+
+/*
+ * Startup quirk:
+ *
+ * Starting up a edge-triggered IO-APIC interrupt is
+ * nasty - we need to make sure that we get the edge.
+ * If it is already asserted for some reason, we need
+ * return 1 to indicate that is was pending.
+ *
+ * This is not complete - we should be able to fake
+ * an edge even if it isn't on the 8259A...
+ *
+ * (We do this for level-triggered IRQs too - it cannot hurt.)
+ */
+static unsigned int startup_ioapic_irq(unsigned int irq)
+{
+ int was_pending = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ if (irq < 16) {
+ disable_8259A_irq(irq);
+ if (i8259A_irq_pending(irq))
+ was_pending = 1;
+ }
+ __unmask_IO_APIC_irq(irq);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return was_pending;
+}
+
+static void ack_ioapic_irq(unsigned int irq)
+{
+ move_native_irq(irq);
+ ack_APIC_irq();
+}
+
+static void ack_ioapic_quirk_irq(unsigned int irq)
+{
+ unsigned long v;
+ int i;
+
+ move_native_irq(irq);
+/*
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets). Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as edge-triggered one but the respective IRR
+ * bit gets set nevertheless. As a result the I/O unit expects an EOI
+ * message but it will never arrive and further interrupts are blocked
+ * from the source. The exact reason is so far unknown, but the
+ * phenomenon was observed when two consecutive interrupt requests
+ * from a given source get delivered to the same CPU and the source is
+ * temporarily disabled in between.
+ *
+ * A workaround is to simulate an EOI message manually. We achieve it
+ * by setting the trigger mode to edge and then to level when the edge
+ * trigger mode gets detected in the TMR of a local APIC for a
+ * level-triggered interrupt. We mask the source for the time of the
+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
+ * The idea is from Manfred Spraul. --macro
+ */
+ i = irq_vector[irq];
+
+ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
+
+ ack_APIC_irq();
+
+ if (!(v & (1 << (i & 0x1f)))) {
+ atomic_inc(&irq_mis_count);
+ spin_lock(&ioapic_lock);
+ __mask_and_edge_IO_APIC_irq(irq);
+ __unmask_and_level_IO_APIC_irq(irq);
+ spin_unlock(&ioapic_lock);
+ }
+}
+
+static int ioapic_retrigger_irq(unsigned int irq)
+{
+ send_IPI_self(irq_vector[irq]);
+
+ return 1;
+}
+
+static struct irq_chip ioapic_chip __read_mostly = {
+ .name = "IO-APIC",
+ .startup = startup_ioapic_irq,
+ .mask = mask_IO_APIC_irq,
+ .unmask = unmask_IO_APIC_irq,
+ .ack = ack_ioapic_irq,
+ .eoi = ack_ioapic_quirk_irq,
+#ifdef CONFIG_SMP
+ .set_affinity = set_ioapic_affinity_irq,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+#endif /* !CONFIG_XEN */
+
+static inline void init_IO_APIC_traps(void)
+{
+ int irq;
+
+ /*
+ * NOTE! The local APIC isn't very good at handling
+ * multiple interrupts at the same interrupt level.
+ * As the interrupt level is determined by taking the
+ * vector number and shifting that right by 4, we
+ * want to spread these out a bit so that they don't
+ * all fall in the same interrupt level.
+ *
+ * Also, we've got to be careful not to trash gate
+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
+ */
+ for (irq = 0; irq < NR_IRQS ; irq++) {
+ if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
+ /*
+ * Hmm.. We don't have an entry for this,
+ * so default to an old-fashioned 8259
+ * interrupt if we can..
+ */
+ if (irq < 16)
+ make_8259A_irq(irq);
+#ifndef CONFIG_XEN
+ else
+ /* Strange. Oh, well.. */
+ irq_desc[irq].chip = &no_irq_chip;
+#endif
+ }
+ }
+}
+
+#ifndef CONFIG_XEN
+/*
+ * The local APIC irq-chip implementation:
+ */
+
+static void ack_apic(unsigned int irq)
+{
+ ack_APIC_irq();
+}
+
+static void mask_lapic_irq (unsigned int irq)
+{
+ unsigned long v;
+
+ v = apic_read(APIC_LVT0);
+ apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+}
+
+static void unmask_lapic_irq (unsigned int irq)
+{
+ unsigned long v;
+
+ v = apic_read(APIC_LVT0);
+ apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+}
+
+static struct irq_chip lapic_chip __read_mostly = {
+ .name = "local-APIC-edge",
+ .mask = mask_lapic_irq,
+ .unmask = unmask_lapic_irq,
+ .eoi = ack_apic,
+};
+
+static void __init setup_nmi(void)
+{
+ /*
+ * Dirty trick to enable the NMI watchdog ...
+ * We put the 8259A master into AEOI mode and
+ * unmask on all local APICs LVT0 as NMI.
+ *
+ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
+ * is from Maciej W. Rozycki - so we do not have to EOI from
+ * the NMI handler or the timer interrupt.
+ */
+ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
+
+ enable_NMI_through_LVT0();
+
+ apic_printk(APIC_VERBOSE, " done.\n");
+}
+
+/*
+ * This looks a bit hackish but it's about the only one way of sending
+ * a few INTA cycles to 8259As and any associated glue logic. ICR does
+ * not support the ExtINT mode, unfortunately. We need to send these
+ * cycles as some i82489DX-based boards have glue logic that keeps the
+ * 8259A interrupt line asserted until INTA. --macro
+ */
+static inline void __init unlock_ExtINT_logic(void)
+{
+ int apic, pin, i;
+ struct IO_APIC_route_entry entry0, entry1;
+ unsigned char save_control, save_freq_select;
+
+ pin = find_isa_irq_pin(8, mp_INT);
+ if (pin == -1) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ apic = find_isa_irq_apic(8, mp_INT);
+ if (apic == -1) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ entry0 = ioapic_read_entry(apic, pin);
+ clear_IO_APIC_pin(apic, pin);
+
+ memset(&entry1, 0, sizeof(entry1));
+
+ entry1.dest_mode = 0; /* physical delivery */
+ entry1.mask = 0; /* unmask IRQ now */
+ entry1.dest.physical.physical_dest = hard_smp_processor_id();
+ entry1.delivery_mode = dest_ExtINT;
+ entry1.polarity = entry0.polarity;
+ entry1.trigger = 0;
+ entry1.vector = 0;
+
+ ioapic_write_entry(apic, pin, entry1);
+
+ save_control = CMOS_READ(RTC_CONTROL);
+ save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
+ RTC_FREQ_SELECT);
+ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
+
+ i = 100;
+ while (i-- > 0) {
+ mdelay(10);
+ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
+ i -= 10;
+ }
+
+ CMOS_WRITE(save_control, RTC_CONTROL);
+ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+ clear_IO_APIC_pin(apic, pin);
+
+ ioapic_write_entry(apic, pin, entry0);
+}
+
+/*
+ * This code may look a bit paranoid, but it's supposed to cooperate with
+ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
+ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
+ * fanatically on his truly buggy board.
+ */
+static inline void __init check_timer(void)
+{
+ int apic1, pin1, apic2, pin2;
+ int vector;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ /*
+ * get/set the timer IRQ vector:
+ */
+ disable_8259A_irq(0);
+ vector = assign_irq_vector(0);
+ set_intr_gate(vector, interrupt[0]);
+
+ /*
+ * Subtle, code in do_timer_interrupt() expects an AEOI
+ * mode for the 8259A whenever interrupts are routed
+ * through I/O APICs. Also IRQ0 has to be enabled in
+ * the 8259A which implies the virtual wire has to be
+ * disabled in the local APIC.
+ */
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ init_8259A(1);
+ timer_ack = 1;
+ if (timer_over_8254 > 0)
+ enable_8259A_irq(0);
+
+ pin1 = find_isa_irq_pin(0, mp_INT);
+ apic1 = find_isa_irq_apic(0, mp_INT);
+ pin2 = ioapic_i8259.pin;
+ apic2 = ioapic_i8259.apic;
+
+ printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ vector, apic1, pin1, apic2, pin2);
+
+ if (pin1 != -1) {
+ /*
+ * Ok, does IRQ0 through the IOAPIC work?
+ */
+ unmask_IO_APIC_irq(0);
+ if (timer_irq_works()) {
+ if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
+ setup_nmi();
+ enable_8259A_irq(0);
+ }
+ if (disable_timer_pin_1 > 0)
+ clear_IO_APIC_pin(0, pin1);
+ goto out;
+ }
+ clear_IO_APIC_pin(apic1, pin1);
+ printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
+ "IO-APIC\n");
+ }
+
+ printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
+ if (pin2 != -1) {
+ printk("\n..... (found pin %d) ...", pin2);
+ /*
+ * legacy devices should be connected to IO APIC #0
+ */
+ setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
+ if (timer_irq_works()) {
+ printk("works.\n");
+ if (pin1 != -1)
+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ else
+ add_pin_to_irq(0, apic2, pin2);
+ if (nmi_watchdog == NMI_IO_APIC) {
+ setup_nmi();
+ }
+ goto out;
+ }
+ /*
+ * Cleanup, just in case ...
+ */
+ clear_IO_APIC_pin(apic2, pin2);
+ }
+ printk(" failed.\n");
+
+ if (nmi_watchdog == NMI_IO_APIC) {
+ printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+ nmi_watchdog = 0;
+ }
+
+ printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+
+ disable_8259A_irq(0);
+ set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
+ "fasteoi");
+ apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
+ enable_8259A_irq(0);
+
+ if (timer_irq_works()) {
+ printk(" works.\n");
+ goto out;
+ }
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+ printk(" failed.\n");
+
+ printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+
+ timer_ack = 0;
+ init_8259A(0);
+ make_8259A_irq(0);
+ apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+
+ unlock_ExtINT_logic();
+
+ if (timer_irq_works()) {
+ printk(" works.\n");
+ goto out;
+ }
+ printk(" failed :(.\n");
+ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
+ "report. Then try booting with the 'noapic' option");
+out:
+ local_irq_restore(flags);
+}
+#else
+int timer_uses_ioapic_pin_0 = 0;
+#define check_timer() ((void)0)
+#endif
+
+/*
+ *
+ * IRQ's that are handled by the PIC in the MPS IOAPIC case.
+ * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
+ * Linux doesn't really care, as it's not actually used
+ * for any interrupt handling anyway.
+ */
+#define PIC_IRQS (1 << PIC_CASCADE_IR)
+
+void __init setup_IO_APIC(void)
+{
+#ifndef CONFIG_XEN
+ int i;
+
+ /* Reserve all the system vectors. */
+ for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
+ set_bit(i, used_vectors);
+#endif
+
+ enable_IO_APIC();
+
+ if (acpi_ioapic)
+ io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
+ else
+ io_apic_irqs = ~PIC_IRQS;
+
+ printk("ENABLING IO-APIC IRQs\n");
+
+ /*
+ * Set up IO-APIC IRQ routing.
+ */
+ if (!acpi_ioapic)
+ setup_ioapic_ids_from_mpc();
+#ifndef CONFIG_XEN
+ sync_Arb_IDs();
+#endif
+ setup_IO_APIC_irqs();
+ init_IO_APIC_traps();
+ check_timer();
+ if (!acpi_ioapic)
+ print_IO_APIC();
+}
+
+#ifndef CONFIG_XEN
+static int __init setup_disable_8254_timer(char *s)
+{
+ timer_over_8254 = -1;
+ return 1;
+}
+static int __init setup_enable_8254_timer(char *s)
+{
+ timer_over_8254 = 2;
+ return 1;
+}
+
+__setup("disable_8254_timer", setup_disable_8254_timer);
+__setup("enable_8254_timer", setup_enable_8254_timer);
+#endif
+
+/*
+ * Called after all the initialization is done. If we didnt find any
+ * APIC bugs then we can allow the modify fast path
+ */
+
+static int __init io_apic_bug_finalize(void)
+{
+ if(sis_apic_bug == -1)
+ sis_apic_bug = 0;
+ if (is_initial_xendomain()) {
+ struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
+ op.u.platform_quirk.quirk_id = sis_apic_bug ?
+ QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL;
+ VOID(HYPERVISOR_platform_op(&op));
+ }
+ return 0;
+}
+
+late_initcall(io_apic_bug_finalize);
+
+struct sysfs_ioapic_data {
+ struct sys_device dev;
+ struct IO_APIC_route_entry entry[0];
+};
+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+
+static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
+{
+ struct IO_APIC_route_entry *entry;
+ struct sysfs_ioapic_data *data;
+ int i;
+
+ data = container_of(dev, struct sysfs_ioapic_data, dev);
+ entry = data->entry;
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ entry[i] = ioapic_read_entry(dev->id, i);
+
+ return 0;
+}
+
+static int ioapic_resume(struct sys_device *dev)
+{
+ struct IO_APIC_route_entry *entry;
+ struct sysfs_ioapic_data *data;
+ unsigned long flags;
+ union IO_APIC_reg_00 reg_00;
+ int i;
+
+ data = container_of(dev, struct sysfs_ioapic_data, dev);
+ entry = data->entry;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(dev->id, 0);
+ if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+ io_apic_write(dev->id, 0, reg_00.raw);
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ ioapic_write_entry(dev->id, i, entry[i]);
+
+ return 0;
+}
+
+static struct sysdev_class ioapic_sysdev_class = {
+ .name = "ioapic",
+ .suspend = ioapic_suspend,
+ .resume = ioapic_resume,
+};
+
+static int __init ioapic_init_sysfs(void)
+{
+ struct sys_device * dev;
+ int i, size, error = 0;
+
+ error = sysdev_class_register(&ioapic_sysdev_class);
+ if (error)
+ return error;
+
+ for (i = 0; i < nr_ioapics; i++ ) {
+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
+ * sizeof(struct IO_APIC_route_entry);
+ mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+ if (!mp_ioapic_data[i]) {
+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+ continue;
+ }
+ memset(mp_ioapic_data[i], 0, size);
+ dev = &mp_ioapic_data[i]->dev;
+ dev->id = i;
+ dev->cls = &ioapic_sysdev_class;
+ error = sysdev_register(dev);
+ if (error) {
+ kfree(mp_ioapic_data[i]);
+ mp_ioapic_data[i] = NULL;
+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+ continue;
+ }
+ }
+
+ return 0;
+}
+
+device_initcall(ioapic_init_sysfs);
+
+#ifndef CONFIG_XEN
+/*
+ * Dynamic irq allocate and deallocation
+ */
+int create_irq(void)
+{
+ /* Allocate an unused irq */
+ int irq, new, vector = 0;
+ unsigned long flags;
+
+ irq = -ENOSPC;
+ spin_lock_irqsave(&vector_lock, flags);
+ for (new = (NR_IRQS - 1); new >= 0; new--) {
+ if (platform_legacy_irq(new))
+ continue;
+ if (irq_vector[new] != 0)
+ continue;
+ vector = __assign_irq_vector(new);
+ if (likely(vector > 0))
+ irq = new;
+ break;
+ }
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+ if (irq >= 0) {
+ set_intr_gate(vector, interrupt[irq]);
+ dynamic_irq_init(irq);
+ }
+ return irq;
+}
+
+void destroy_irq(unsigned int irq)
+{
+ unsigned long flags;
+
+ dynamic_irq_cleanup(irq);
+
+ spin_lock_irqsave(&vector_lock, flags);
+ clear_bit(irq_vector[irq], used_vectors);
+ irq_vector[irq] = 0;
+ spin_unlock_irqrestore(&vector_lock, flags);
+}
+#endif
+
+/*
+ * MSI message composition
+ */
+#ifdef CONFIG_PCI_MSI
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+{
+ int vector;
+ unsigned dest;
+
+ vector = assign_irq_vector(irq);
+ if (vector >= 0) {
+ dest = cpu_mask_to_apicid(TARGET_CPUS);
+
+ msg->address_hi = MSI_ADDR_BASE_HI;
+ msg->address_lo =
+ MSI_ADDR_BASE_LO |
+ ((INT_DEST_MODE == 0) ?
+ MSI_ADDR_DEST_MODE_PHYSICAL:
+ MSI_ADDR_DEST_MODE_LOGICAL) |
+ ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ MSI_ADDR_REDIRECTION_CPU:
+ MSI_ADDR_REDIRECTION_LOWPRI) |
+ MSI_ADDR_DEST_ID(dest);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ MSI_DATA_DELIVERY_FIXED:
+ MSI_DATA_DELIVERY_LOWPRI) |
+ MSI_DATA_VECTOR(vector);
+ }
+ return vector;
+}
+
+#ifdef CONFIG_SMP
+static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+ struct msi_msg msg;
+ unsigned int dest;
+ cpumask_t tmp;
+ int vector;
+
+ cpus_and(tmp, mask, cpu_online_map);
+ if (cpus_empty(tmp))
+ tmp = TARGET_CPUS;
+
+ vector = assign_irq_vector(irq);
+ if (vector < 0)
+ return;
+
+ dest = cpu_mask_to_apicid(mask);
+
+ read_msi_msg(irq, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+ write_msi_msg(irq, &msg);
+ irq_desc[irq].affinity = mask;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip msi_chip = {
+ .name = "PCI-MSI",
+ .unmask = unmask_msi_irq,
+ .mask = mask_msi_irq,
+ .ack = ack_ioapic_irq,
+#ifdef CONFIG_SMP
+ .set_affinity = set_msi_irq_affinity,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+{
+ struct msi_msg msg;
+ int irq, ret;
+ irq = create_irq();
+ if (irq < 0)
+ return irq;
+
+ ret = msi_compose_msg(dev, irq, &msg);
+ if (ret < 0) {
+ destroy_irq(irq);
+ return ret;
+ }
+
+ set_irq_msi(irq, desc);
+ write_msi_msg(irq, &msg);
+
+ set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
+ "edge");
+
+ return 0;
+}
+
+void arch_teardown_msi_irq(unsigned int irq)
+{
+ destroy_irq(irq);
+}
+
+#endif /* CONFIG_PCI_MSI */
+
+/*
+ * Hypertransport interrupt support
+ */
+#ifdef CONFIG_HT_IRQ
+
+#ifdef CONFIG_SMP
+
+static void target_ht_irq(unsigned int irq, unsigned int dest)
+{
+ struct ht_irq_msg msg;
+ fetch_ht_irq_msg(irq, &msg);
+
+ msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
+ msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+
+ msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
+ msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
+
+ write_ht_irq_msg(irq, &msg);
+}
+
+static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+ unsigned int dest;
+ cpumask_t tmp;
+
+ cpus_and(tmp, mask, cpu_online_map);
+ if (cpus_empty(tmp))
+ tmp = TARGET_CPUS;
+
+ cpus_and(mask, tmp, CPU_MASK_ALL);
+
+ dest = cpu_mask_to_apicid(mask);
+
+ target_ht_irq(irq, dest);
+ irq_desc[irq].affinity = mask;
+}
+#endif
+
+static struct irq_chip ht_irq_chip = {
+ .name = "PCI-HT",
+ .mask = mask_ht_irq,
+ .unmask = unmask_ht_irq,
+ .ack = ack_ioapic_irq,
+#ifdef CONFIG_SMP
+ .set_affinity = set_ht_irq_affinity,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+{
+ int vector;
+
+ vector = assign_irq_vector(irq);
+ if (vector >= 0) {
+ struct ht_irq_msg msg;
+ unsigned dest;
+ cpumask_t tmp;
+
+ cpus_clear(tmp);
+ cpu_set(vector >> 8, tmp);
+ dest = cpu_mask_to_apicid(tmp);
+
+ msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+
+ msg.address_lo =
+ HT_IRQ_LOW_BASE |
+ HT_IRQ_LOW_DEST_ID(dest) |
+ HT_IRQ_LOW_VECTOR(vector) |
+ ((INT_DEST_MODE == 0) ?
+ HT_IRQ_LOW_DM_PHYSICAL :
+ HT_IRQ_LOW_DM_LOGICAL) |
+ HT_IRQ_LOW_RQEOI_EDGE |
+ ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ HT_IRQ_LOW_MT_FIXED :
+ HT_IRQ_LOW_MT_ARBITRATED) |
+ HT_IRQ_LOW_IRQ_MASKED;
+
+ write_ht_irq_msg(irq, &msg);
+
+ set_irq_chip_and_handler_name(irq, &ht_irq_chip,
+ handle_edge_irq, "edge");
+ }
+ return vector;
+}
+#endif /* CONFIG_HT_IRQ */
+
+/* --------------------------------------------------------------------------
+ ACPI-based IOAPIC Configuration
+ -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI
+
+int __init io_apic_get_unique_id (int ioapic, int apic_id)
+{
+#ifndef CONFIG_XEN
+ union IO_APIC_reg_00 reg_00;
+ static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
+ physid_mask_t tmp;
+ unsigned long flags;
+ int i = 0;
+
+ /*
+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
+ * supports up to 16 on one shared APIC bus.
+ *
+ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+ * advantage of new APIC bus architecture.
+ */
+
+ if (physids_empty(apic_id_map))
+ apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ if (apic_id >= get_physical_broadcast()) {
+ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+ "%d\n", ioapic, apic_id, reg_00.bits.ID);
+ apic_id = reg_00.bits.ID;
+ }
+
+ /*
+ * Every APIC in a system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (check_apicid_used(apic_id_map, apic_id)) {
+
+ for (i = 0; i < get_physical_broadcast(); i++) {
+ if (!check_apicid_used(apic_id_map, i))
+ break;
+ }
+
+ if (i == get_physical_broadcast())
+ panic("Max apic_id exceeded!\n");
+
+ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+ "trying %d\n", ioapic, apic_id, i);
+
+ apic_id = i;
+ }
+
+ tmp = apicid_to_cpu_present(apic_id);
+ physids_or(apic_id_map, apic_id_map, tmp);
+
+ if (reg_00.bits.ID != apic_id) {
+ reg_00.bits.ID = apic_id;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(ioapic, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /* Sanity check */
+ if (reg_00.bits.ID != apic_id) {
+ printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+ return -1;
+ }
+ }
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+#endif /* !CONFIG_XEN */
+
+ return apic_id;
+}
+
+
+int __init io_apic_get_version (int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return reg_01.bits.version;
+}
+
+
+int __init io_apic_get_redir_entries (int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return reg_01.bits.entries;
+}
+
+
+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+{
+ struct IO_APIC_route_entry entry;
+
+ if (!IO_APIC_IRQ(irq)) {
+ printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+ ioapic);
+ return -EINVAL;
+ }
+
+ /*
+ * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
+ * Note that we mask (disable) IRQs now -- these get enabled when the
+ * corresponding device driver registers for this IRQ.
+ */
+
+ memset(&entry,0,sizeof(entry));
+
+ entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.dest_mode = INT_DEST_MODE;
+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+ entry.trigger = edge_level;
+ entry.polarity = active_high_low;
+ entry.mask = 1;
+
+ /*
+ * IRQs < 16 are already in the irq_2_pin[] map
+ */
+ if (irq >= 16)
+ add_pin_to_irq(irq, ioapic, pin);
+
+ entry.vector = assign_irq_vector(irq);
+
+ apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
+ "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
+ mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+ edge_level, active_high_low);
+
+ ioapic_register_intr(irq, entry.vector, edge_level);
+
+ if (!ioapic && (irq < 16))
+ disable_8259A_irq(irq);
+
+ ioapic_write_entry(ioapic, pin, entry);
+
+ return 0;
+}
+
+int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
+{
+ int i;
+
+ if (skip_ioapic_setup)
+ return -1;
+
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].mpc_irqtype == mp_INT &&
+ mp_irqs[i].mpc_srcbusirq == bus_irq)
+ break;
+ if (i >= mp_irq_entries)
+ return -1;
+
+ *trigger = irq_trigger(i);
+ *polarity = irq_polarity(i);
+ return 0;
+}
+
+#endif /* CONFIG_ACPI */
+
+#ifndef CONFIG_XEN
+static int __init parse_disable_timer_pin_1(char *arg)
+{
+ disable_timer_pin_1 = 1;
+ return 0;
+}
+early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
+
+static int __init parse_enable_timer_pin_1(char *arg)
+{
+ disable_timer_pin_1 = -1;
+ return 0;
+}
+early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
+#endif
+
+static int __init parse_noapic(char *arg)
+{
+ /* disable IO-APIC */
+ disable_ioapic_setup();
+ return 0;
+}
+early_param("noapic", parse_noapic);
--- /dev/null
+/*
+ * Intel IO-APIC support for multi-Pentium hosts.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
+ *
+ * Many thanks to Stig Venaas for trying out countless experimental
+ * patches and reporting/debugging problems patiently!
+ *
+ * (c) 1999, Multiple IO-APIC support, developed by
+ * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
+ * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
+ * further tested and cleaned up by Zach Brown <zab@redhat.com>
+ * and Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
+ * thanks to Eric Gilmore
+ * and Rolf G. Tews
+ * for testing these extensively
+ * Paul Diefenbaugh : Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/mc146818rtc.h>
+#include <linux/acpi.h>
+#include <linux/sysdev.h>
+#include <linux/msi.h>
+#include <linux/htirq.h>
+#include <linux/dmar.h>
+#include <linux/jiffies.h>
+#ifdef CONFIG_ACPI
+#include <acpi/acpi_bus.h>
+#endif
+#include <linux/bootmem.h>
+
+#include <asm/idle.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/acpi.h>
+#include <asm/dma.h>
+#include <asm/nmi.h>
+#include <asm/msidef.h>
+#include <asm/hypertransport.h>
+
+#include <mach_ipi.h>
+#include <mach_apic.h>
+
+struct irq_cfg {
+#ifndef CONFIG_XEN
+ cpumask_t domain;
+ cpumask_t old_domain;
+#endif
+ unsigned move_cleanup_count;
+ u8 vector;
+ u8 move_in_progress : 1;
+};
+
+/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
+
+static int assign_irq_vector(int irq, cpumask_t mask);
+
+#define __apicdebuginit __init
+
+int sis_apic_bug; /* not actually supported, dummy for compile */
+
+#ifdef CONFIG_XEN
+#include <xen/interface/xen.h>
+#include <xen/interface/physdev.h>
+
+/* Fake i8259 */
+#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
+#define disable_8259A_irq(_irq) ((void)0)
+#define i8259A_irq_pending(_irq) (0)
+
+unsigned long io_apic_irqs;
+
+#define clear_IO_APIC() ((void)0)
+#else
+static int no_timer_check;
+
+static int disable_timer_pin_1 __initdata;
+
+int timer_over_8254 __initdata = 1;
+
+/* Where if anywhere is the i8259 connect in external int mode */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+#endif
+
+static DEFINE_SPINLOCK(ioapic_lock);
+DEFINE_SPINLOCK(vector_lock);
+
+/*
+ * # of IRQ routing registers
+ */
+int nr_ioapic_registers[MAX_IO_APICS];
+
+/* I/O APIC entries */
+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+int nr_ioapics;
+
+/* MP IRQ source entries */
+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* # of MP IRQ source entries */
+int mp_irq_entries;
+
+/*
+ * Rough estimation of how many shared IRQs there are, can
+ * be changed anytime.
+ */
+#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+static struct irq_pin_list {
+ short apic, pin, next;
+} irq_2_pin[PIN_MAP_SIZE];
+
+#ifndef CONFIG_XEN
+struct io_apic {
+ unsigned int index;
+ unsigned int unused[3];
+ unsigned int data;
+};
+
+static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+{
+ return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+ + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+}
+#endif
+
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+#ifndef CONFIG_XEN
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ return readl(&io_apic->data);
+#else
+ struct physdev_apic apic_op;
+ int ret;
+
+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+ apic_op.reg = reg;
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
+ if (ret)
+ return ret;
+ return apic_op.value;
+#endif
+}
+
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+#ifndef CONFIG_XEN
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ writel(value, &io_apic->data);
+#else
+ struct physdev_apic apic_op;
+
+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+ apic_op.reg = reg;
+ apic_op.value = value;
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
+#endif
+}
+
+#ifdef CONFIG_XEN
+#define io_apic_modify io_apic_write
+#else
+/*
+ * Re-write a value: to be used for read-modify-write
+ * cycles where the read already set up the index register.
+ */
+static inline void io_apic_modify(unsigned int apic, unsigned int value)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(value, &io_apic->data);
+}
+
+static bool io_apic_level_ack_pending(unsigned int irq)
+{
+ struct irq_pin_list *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ entry = irq_2_pin + irq;
+ for (;;) {
+ unsigned int reg;
+ int pin;
+
+ pin = entry->pin;
+ if (pin == -1)
+ break;
+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ /* Is the remote IRR bit set? */
+ if ((reg >> 14) & 1) {
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ return true;
+ }
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return false;
+}
+#endif
+
+/*
+ * Synchronize the IO-APIC and the CPU by doing
+ * a dummy read from the IO-APIC
+ */
+static inline void io_apic_sync(unsigned int apic)
+{
+#ifndef CONFIG_XEN
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ readl(&io_apic->data);
+#endif
+}
+
+union entry_union {
+ struct { u32 w1, w2; };
+ struct IO_APIC_route_entry entry;
+};
+
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+ union entry_union eu;
+ unsigned long flags;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ return eu.entry;
+}
+
+/*
+ * When we write a new IO APIC routing entry, we need to write the high
+ * word first! If the mask bit in the low word is clear, we will enable
+ * the interrupt, and we need to make sure the entry is fully populated
+ * before that happens.
+ */
+static void
+__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ union entry_union eu;
+ eu.entry = e;
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __ioapic_write_entry(apic, pin, e);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * When we mask an IO APIC routing entry, we need to write the low
+ * word first, in order to set the mask bit before we change the
+ * high bits!
+ */
+static void ioapic_mask_entry(int apic, int pin)
+{
+ unsigned long flags;
+ union entry_union eu = { .entry.mask = 1 };
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#ifdef CONFIG_SMP
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+{
+ int apic, pin;
+ struct irq_pin_list *entry = irq_2_pin + irq;
+
+ BUG_ON(irq >= NR_IRQS);
+ for (;;) {
+ unsigned int reg;
+ apic = entry->apic;
+ pin = entry->pin;
+ if (pin == -1)
+ break;
+ io_apic_write(apic, 0x11 + pin*2, dest);
+ reg = io_apic_read(apic, 0x10 + pin*2);
+ reg &= ~0x000000ff;
+ reg |= vector;
+ io_apic_modify(apic, reg);
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+}
+
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+ struct irq_cfg *cfg = irq_cfg + irq;
+ unsigned long flags;
+ unsigned int dest;
+ cpumask_t tmp;
+
+ cpus_and(tmp, mask, cpu_online_map);
+ if (cpus_empty(tmp))
+ return;
+
+ if (assign_irq_vector(irq, mask))
+ return;
+
+ cpus_and(tmp, cfg->domain, mask);
+ dest = cpu_mask_to_apicid(tmp);
+
+ /*
+ * Only the high 8 bits are valid.
+ */
+ dest = SET_APIC_LOGICAL_ID(dest);
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __target_IO_APIC_irq(irq, dest, cfg->vector);
+ irq_desc[irq].affinity = mask;
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+#endif
+#endif
+
+/*
+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
+ * shared ISA-space IRQs, so we have to support them. We are super
+ * fast in the common case, and fast for shared ISA-space IRQs.
+ */
+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+{
+ static int first_free_entry = NR_IRQS;
+ struct irq_pin_list *entry = irq_2_pin + irq;
+
+ BUG_ON(irq >= NR_IRQS);
+ while (entry->next)
+ entry = irq_2_pin + entry->next;
+
+ if (entry->pin != -1) {
+ entry->next = first_free_entry;
+ entry = irq_2_pin + entry->next;
+ if (++first_free_entry >= PIN_MAP_SIZE)
+ panic("io_apic.c: ran out of irq_2_pin entries!");
+ }
+ entry->apic = apic;
+ entry->pin = pin;
+}
+
+#ifndef CONFIG_XEN
+#define __DO_ACTION(R, ACTION, FINAL) \
+ \
+{ \
+ int pin; \
+ struct irq_pin_list *entry = irq_2_pin + irq; \
+ \
+ BUG_ON(irq >= NR_IRQS); \
+ for (;;) { \
+ unsigned int reg; \
+ pin = entry->pin; \
+ if (pin == -1) \
+ break; \
+ reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
+ reg ACTION; \
+ io_apic_modify(entry->apic, reg); \
+ FINAL; \
+ if (!entry->next) \
+ break; \
+ entry = irq_2_pin + entry->next; \
+ } \
+}
+
+#define DO_ACTION(name,R,ACTION, FINAL) \
+ \
+ static void name##_IO_APIC_irq (unsigned int irq) \
+ __DO_ACTION(R, ACTION, FINAL)
+
+DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
+ /* mask = 1 */
+DO_ACTION( __unmask, 0, &= 0xfffeffff, )
+ /* mask = 0 */
+
+static void mask_IO_APIC_irq (unsigned int irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __mask_IO_APIC_irq(irq);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void unmask_IO_APIC_irq (unsigned int irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __unmask_IO_APIC_irq(irq);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+{
+ struct IO_APIC_route_entry entry;
+
+ /* Check delivery_mode to be sure we're not clearing an SMI pin */
+ entry = ioapic_read_entry(apic, pin);
+ if (entry.delivery_mode == dest_SMI)
+ return;
+ /*
+ * Disable it in the IO-APIC irq-routing table:
+ */
+ ioapic_mask_entry(apic, pin);
+}
+
+static void clear_IO_APIC (void)
+{
+ int apic, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+ clear_IO_APIC_pin(apic, pin);
+}
+
+#endif /* !CONFIG_XEN */
+
+int skip_ioapic_setup;
+int ioapic_force;
+
+static int __init parse_noapic(char *str)
+{
+ disable_ioapic_setup();
+ return 0;
+}
+early_param("noapic", parse_noapic);
+
+#ifndef CONFIG_XEN
+/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
+static int __init disable_timer_pin_setup(char *arg)
+{
+ disable_timer_pin_1 = 1;
+ return 1;
+}
+__setup("disable_timer_pin_1", disable_timer_pin_setup);
+
+static int __init setup_disable_8254_timer(char *s)
+{
+ timer_over_8254 = -1;
+ return 1;
+}
+static int __init setup_enable_8254_timer(char *s)
+{
+ timer_over_8254 = 2;
+ return 1;
+}
+
+__setup("disable_8254_timer", setup_disable_8254_timer);
+__setup("enable_8254_timer", setup_enable_8254_timer);
+#endif /* !CONFIG_XEN */
+
+
+/*
+ * Find the IRQ entry number of a certain pin.
+ */
+static int find_irq_entry(int apic, int pin, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].mpc_irqtype == type &&
+ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].mpc_dstirq == pin)
+ return i;
+
+ return -1;
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Find the pin to which IRQ[irq] (ISA) is connected
+ */
+static int __init find_isa_irq_pin(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].mpc_srcbus;
+
+ if (test_bit(lbus, mp_bus_not_pci) &&
+ (mp_irqs[i].mpc_irqtype == type) &&
+ (mp_irqs[i].mpc_srcbusirq == irq))
+
+ return mp_irqs[i].mpc_dstirq;
+ }
+ return -1;
+}
+
+static int __init find_isa_irq_apic(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].mpc_srcbus;
+
+ if (test_bit(lbus, mp_bus_not_pci) &&
+ (mp_irqs[i].mpc_irqtype == type) &&
+ (mp_irqs[i].mpc_srcbusirq == irq))
+ break;
+ }
+ if (i < mp_irq_entries) {
+ int apic;
+ for(apic = 0; apic < nr_ioapics; apic++) {
+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+ return apic;
+ }
+ }
+
+ return -1;
+}
+#endif
+
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+static int pin_2_irq(int idx, int apic, int pin);
+
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
+{
+ int apic, i, best_guess = -1;
+
+ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+ bus, slot, pin);
+ if (mp_bus_id_to_pci_bus[bus] == -1) {
+ apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ return -1;
+ }
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].mpc_srcbus;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ break;
+
+ if (!test_bit(lbus, mp_bus_not_pci) &&
+ !mp_irqs[i].mpc_irqtype &&
+ (bus == lbus) &&
+ (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+
+ if (!(apic || IO_APIC_IRQ(irq)))
+ continue;
+
+ if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+ return irq;
+ /*
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
+ */
+ if (best_guess < 0)
+ best_guess = irq;
+ }
+ }
+ BUG_ON(best_guess >= NR_IRQS);
+ return best_guess;
+}
+
+/* ISA interrupts are always polarity zero edge triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_ISA_trigger(idx) (0)
+#define default_ISA_polarity(idx) (0)
+
+/* PCI interrupts are always polarity one level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_PCI_trigger(idx) (1)
+#define default_PCI_polarity(idx) (1)
+
+static int MPBIOS_polarity(int idx)
+{
+ int bus = mp_irqs[idx].mpc_srcbus;
+ int polarity;
+
+ /*
+ * Determine IRQ line polarity (high active or low active):
+ */
+ switch (mp_irqs[idx].mpc_irqflag & 3)
+ {
+ case 0: /* conforms, ie. bus-type dependent polarity */
+ if (test_bit(bus, mp_bus_not_pci))
+ polarity = default_ISA_polarity(idx);
+ else
+ polarity = default_PCI_polarity(idx);
+ break;
+ case 1: /* high active */
+ {
+ polarity = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ case 3: /* low active */
+ {
+ polarity = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ }
+ return polarity;
+}
+
+static int MPBIOS_trigger(int idx)
+{
+ int bus = mp_irqs[idx].mpc_srcbus;
+ int trigger;
+
+ /*
+ * Determine IRQ trigger mode (edge or level sensitive):
+ */
+ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+ {
+ case 0: /* conforms, ie. bus-type dependent */
+ if (test_bit(bus, mp_bus_not_pci))
+ trigger = default_ISA_trigger(idx);
+ else
+ trigger = default_PCI_trigger(idx);
+ break;
+ case 1: /* edge */
+ {
+ trigger = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ case 3: /* level */
+ {
+ trigger = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 0;
+ break;
+ }
+ }
+ return trigger;
+}
+
+static inline int irq_polarity(int idx)
+{
+ return MPBIOS_polarity(idx);
+}
+
+static inline int irq_trigger(int idx)
+{
+ return MPBIOS_trigger(idx);
+}
+
+static int pin_2_irq(int idx, int apic, int pin)
+{
+ int irq, i;
+ int bus = mp_irqs[idx].mpc_srcbus;
+
+ /*
+ * Debugging check, we are in big trouble if this message pops up!
+ */
+ if (mp_irqs[idx].mpc_dstirq != pin)
+ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
+
+ if (test_bit(bus, mp_bus_not_pci)) {
+ irq = mp_irqs[idx].mpc_srcbusirq;
+ } else {
+ /*
+ * PCI IRQs are mapped in order
+ */
+ i = irq = 0;
+ while (i < apic)
+ irq += nr_ioapic_registers[i++];
+ irq += pin;
+ }
+ BUG_ON(irq >= NR_IRQS);
+ return irq;
+}
+
+static int __assign_irq_vector(int irq, cpumask_t mask)
+{
+ struct physdev_irq irq_op;
+ struct irq_cfg *cfg;
+
+ BUG_ON((unsigned)irq >= NR_IRQS);
+ cfg = &irq_cfg[irq];
+
+ if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+ return -EBUSY;
+
+ if (cfg->vector)
+ return 0;
+
+ irq_op.irq = irq;
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
+ return -ENOSPC;
+
+ cfg->vector = irq_op.vector;
+
+ return 0;
+}
+
+static int assign_irq_vector(int irq, cpumask_t mask)
+{
+ int err;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vector_lock, flags);
+ err = __assign_irq_vector(irq, mask);
+ spin_unlock_irqrestore(&vector_lock, flags);
+ return err;
+}
+
+#ifndef CONFIG_XEN
+static void __clear_irq_vector(int irq)
+{
+ struct irq_cfg *cfg;
+ cpumask_t mask;
+ int cpu, vector;
+
+ BUG_ON((unsigned)irq >= NR_IRQS);
+ cfg = &irq_cfg[irq];
+ BUG_ON(!cfg->vector);
+
+ vector = cfg->vector;
+ cpus_and(mask, cfg->domain, cpu_online_map);
+ for_each_cpu_mask(cpu, mask)
+ per_cpu(vector_irq, cpu)[vector] = -1;
+
+ cfg->vector = 0;
+ cpus_clear(cfg->domain);
+}
+
+void __setup_vector_irq(int cpu)
+{
+ /* Initialize vector_irq on a new cpu */
+ /* This function must be called with vector_lock held */
+ int irq, vector;
+
+ /* Mark the inuse vectors */
+ for (irq = 0; irq < NR_IRQS; ++irq) {
+ if (!cpu_isset(cpu, irq_cfg[irq].domain))
+ continue;
+ vector = irq_cfg[irq].vector;
+ per_cpu(vector_irq, cpu)[vector] = irq;
+ }
+ /* Mark the free vectors */
+ for (vector = 0; vector < NR_VECTORS; ++vector) {
+ irq = per_cpu(vector_irq, cpu)[vector];
+ if (irq < 0)
+ continue;
+ if (!cpu_isset(cpu, irq_cfg[irq].domain))
+ per_cpu(vector_irq, cpu)[vector] = -1;
+ }
+}
+
+static struct irq_chip ioapic_chip;
+
+static void ioapic_register_intr(int irq, unsigned long trigger)
+{
+ if (trigger) {
+ irq_desc[irq].status |= IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &ioapic_chip,
+ handle_fasteoi_irq, "fasteoi");
+ } else {
+ irq_desc[irq].status &= ~IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &ioapic_chip,
+ handle_edge_irq, "edge");
+ }
+}
+#else
+#define ioapic_register_intr(irq,trigger) ((void)0)
+#endif /* !CONFIG_XEN */
+
+static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+ int trigger, int polarity)
+{
+ struct irq_cfg *cfg = irq_cfg + irq;
+ struct IO_APIC_route_entry entry;
+ cpumask_t mask;
+
+ if (!IO_APIC_IRQ(irq))
+ return;
+
+ mask = TARGET_CPUS;
+ if (assign_irq_vector(irq, mask))
+ return;
+
+#ifndef CONFIG_XEN
+ cpus_and(mask, cfg->domain, mask);
+#endif
+
+ apic_printk(APIC_VERBOSE,KERN_DEBUG
+ "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
+ "IRQ %d Mode:%i Active:%i)\n",
+ apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
+ irq, trigger, polarity);
+
+ /*
+ * add it to the IO-APIC irq-routing table:
+ */
+ memset(&entry,0,sizeof(entry));
+
+ entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.dest_mode = INT_DEST_MODE;
+ entry.dest = cpu_mask_to_apicid(mask);
+ entry.mask = 0; /* enable IRQ */
+ entry.trigger = trigger;
+ entry.polarity = polarity;
+ entry.vector = cfg->vector;
+
+ /* Mask level triggered irqs.
+ * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
+ */
+ if (trigger)
+ entry.mask = 1;
+
+ ioapic_register_intr(irq, trigger);
+ if (irq < 16)
+ disable_8259A_irq(irq);
+
+ ioapic_write_entry(apic, pin, entry);
+}
+
+static void __init setup_IO_APIC_irqs(void)
+{
+ int apic, pin, idx, irq, first_notcon = 1;
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+
+ idx = find_irq_entry(apic,pin,mp_INT);
+ if (idx == -1) {
+ if (first_notcon) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+ first_notcon = 0;
+ } else
+ apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+ continue;
+ }
+ if (!first_notcon) {
+ apic_printk(APIC_VERBOSE, " not connected.\n");
+ first_notcon = 1;
+ }
+
+ irq = pin_2_irq(idx, apic, pin);
+ add_pin_to_irq(irq, apic, pin);
+
+ setup_IO_APIC_irq(apic, pin, irq,
+ irq_trigger(idx), irq_polarity(idx));
+ }
+ }
+
+ if (!first_notcon)
+ apic_printk(APIC_VERBOSE, " not connected.\n");
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Set up the 8259A-master output pin as broadcast to all
+ * CPUs.
+ */
+static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+{
+ struct IO_APIC_route_entry entry;
+
+ memset(&entry, 0, sizeof(entry));
+
+ disable_8259A_irq(0);
+
+ /* mask LVT0 */
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+
+ /*
+ * We use logical delivery to get the timer IRQ
+ * to the first CPU.
+ */
+ entry.dest_mode = INT_DEST_MODE;
+ entry.mask = 0; /* unmask IRQ now */
+ entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
+ entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.polarity = 0;
+ entry.trigger = 0;
+ entry.vector = vector;
+
+ /*
+ * The timer IRQ doesn't have to know that behind the
+ * scene we have a 8259A-master in AEOI mode ...
+ */
+ set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ ioapic_write_entry(apic, pin, entry);
+
+ enable_8259A_irq(0);
+}
+
+void __apicdebuginit print_IO_APIC(void)
+{
+ int apic, i;
+ union IO_APIC_reg_00 reg_00;
+ union IO_APIC_reg_01 reg_01;
+ union IO_APIC_reg_02 reg_02;
+ unsigned long flags;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+ for (i = 0; i < nr_ioapics; i++)
+ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
+ mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+
+ /*
+ * We are a bit conservative about what we expect. We have to
+ * know about every hardware change ASAP.
+ */
+ printk(KERN_INFO "testing the IO APIC.......................\n");
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ reg_01.raw = io_apic_read(apic, 1);
+ if (reg_01.bits.version >= 0x10)
+ reg_02.raw = io_apic_read(apic, 2);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ printk("\n");
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
+ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
+
+ printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01);
+ printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
+
+ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
+ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
+
+ if (reg_01.bits.version >= 0x10) {
+ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
+ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
+ }
+
+ printk(KERN_DEBUG ".... IRQ redirection table:\n");
+
+ printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
+ " Stat Dmod Deli Vect: \n");
+
+ for (i = 0; i <= reg_01.bits.entries; i++) {
+ struct IO_APIC_route_entry entry;
+
+ entry = ioapic_read_entry(apic, i);
+
+ printk(KERN_DEBUG " %02x %03X ",
+ i,
+ entry.dest
+ );
+
+ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
+ entry.mask,
+ entry.trigger,
+ entry.irr,
+ entry.polarity,
+ entry.delivery_status,
+ entry.dest_mode,
+ entry.delivery_mode,
+ entry.vector
+ );
+ }
+ }
+ printk(KERN_DEBUG "IRQ to pin mappings:\n");
+ for (i = 0; i < NR_IRQS; i++) {
+ struct irq_pin_list *entry = irq_2_pin + i;
+ if (entry->pin < 0)
+ continue;
+ printk(KERN_DEBUG "IRQ%d ", i);
+ for (;;) {
+ printk("-> %d:%d", entry->apic, entry->pin);
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+ printk("\n");
+ }
+
+ printk(KERN_INFO ".................................... done.\n");
+
+ return;
+}
+
+static __apicdebuginit void print_APIC_bitfield (int base)
+{
+ unsigned int v;
+ int i, j;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
+ for (i = 0; i < 8; i++) {
+ v = apic_read(base + i*0x10);
+ for (j = 0; j < 32; j++) {
+ if (v & (1<<j))
+ printk("1");
+ else
+ printk("0");
+ }
+ printk("\n");
+ }
+}
+
+void __apicdebuginit print_local_APIC(void * dummy)
+{
+ unsigned int v, ver, maxlvt;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
+ smp_processor_id(), hard_smp_processor_id());
+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
+ v = apic_read(APIC_LVR);
+ printk(KERN_INFO "... APIC VERSION: %08x\n", v);
+ ver = GET_APIC_VERSION(v);
+ maxlvt = lapic_get_maxlvt();
+
+ v = apic_read(APIC_TASKPRI);
+ printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+ v = apic_read(APIC_ARBPRI);
+ printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+ v & APIC_ARBPRI_MASK);
+ v = apic_read(APIC_PROCPRI);
+ printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+
+ v = apic_read(APIC_EOI);
+ printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
+ v = apic_read(APIC_RRR);
+ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+ v = apic_read(APIC_LDR);
+ printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
+ v = apic_read(APIC_DFR);
+ printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+ v = apic_read(APIC_SPIV);
+ printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
+
+ printk(KERN_DEBUG "... APIC ISR field:\n");
+ print_APIC_bitfield(APIC_ISR);
+ printk(KERN_DEBUG "... APIC TMR field:\n");
+ print_APIC_bitfield(APIC_TMR);
+ printk(KERN_DEBUG "... APIC IRR field:\n");
+ print_APIC_bitfield(APIC_IRR);
+
+ v = apic_read(APIC_ESR);
+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+
+ v = apic_read(APIC_ICR);
+ printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
+ v = apic_read(APIC_ICR2);
+ printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
+
+ v = apic_read(APIC_LVTT);
+ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
+
+ if (maxlvt > 3) { /* PC is LVT#4. */
+ v = apic_read(APIC_LVTPC);
+ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
+ }
+ v = apic_read(APIC_LVT0);
+ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
+ v = apic_read(APIC_LVT1);
+ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
+
+ if (maxlvt > 2) { /* ERR is LVT#3. */
+ v = apic_read(APIC_LVTERR);
+ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_TMICT);
+ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
+ v = apic_read(APIC_TMCCT);
+ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
+ v = apic_read(APIC_TDCR);
+ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+ printk("\n");
+}
+
+void print_all_local_APICs (void)
+{
+ on_each_cpu(print_local_APIC, NULL, 1, 1);
+}
+
+void __apicdebuginit print_PIC(void)
+{
+ unsigned int v;
+ unsigned long flags;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk(KERN_DEBUG "\nprinting PIC contents\n");
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+
+ v = inb(0xa1) << 8 | inb(0x21);
+ printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
+
+ v = inb(0xa0) << 8 | inb(0x20);
+ printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
+
+ outb(0x0b,0xa0);
+ outb(0x0b,0x20);
+ v = inb(0xa0) << 8 | inb(0x20);
+ outb(0x0a,0xa0);
+ outb(0x0a,0x20);
+
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
+
+ v = inb(0x4d1) << 8 | inb(0x4d0);
+ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
+}
+#endif /* !CONFIG_XEN */
+
+void __init enable_IO_APIC(void)
+{
+ union IO_APIC_reg_01 reg_01;
+#ifndef CONFIG_XEN
+ int i8259_apic, i8259_pin;
+#endif
+ int i, apic;
+ unsigned long flags;
+
+ for (i = 0; i < PIN_MAP_SIZE; i++) {
+ irq_2_pin[i].pin = -1;
+ irq_2_pin[i].next = 0;
+ }
+
+ /*
+ * The number of IO-APIC IRQ registers (== #pins):
+ */
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(apic, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ nr_ioapic_registers[apic] = reg_01.bits.entries+1;
+ }
+#ifndef CONFIG_XEN
+ for(apic = 0; apic < nr_ioapics; apic++) {
+ int pin;
+ /* See if any of the pins is in ExtINT mode */
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ struct IO_APIC_route_entry entry;
+ entry = ioapic_read_entry(apic, pin);
+
+ /* If the interrupt line is enabled and in ExtInt mode
+ * I have found the pin where the i8259 is connected.
+ */
+ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+ ioapic_i8259.apic = apic;
+ ioapic_i8259.pin = pin;
+ goto found_i8259;
+ }
+ }
+ }
+ found_i8259:
+ /* Look to see what if the MP table has reported the ExtINT */
+ i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
+ i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
+ /* Trust the MP table if nothing is setup in the hardware */
+ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
+ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+ ioapic_i8259.pin = i8259_pin;
+ ioapic_i8259.apic = i8259_apic;
+ }
+ /* Complain if the MP table and the hardware disagree */
+ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+ {
+ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
+ }
+#endif
+
+ /*
+ * Do not trust the IO-APIC being empty at bootup
+ */
+ clear_IO_APIC();
+}
+
+/*
+ * Not an __init, needed by the reboot code
+ */
+void disable_IO_APIC(void)
+{
+ /*
+ * Clear the IO-APIC before rebooting:
+ */
+ clear_IO_APIC();
+
+#ifndef CONFIG_XEN
+ /*
+ * If the i8259 is routed through an IOAPIC
+ * Put that IOAPIC in virtual wire mode
+ * so legacy interrupts can be delivered.
+ */
+ if (ioapic_i8259.pin != -1) {
+ struct IO_APIC_route_entry entry;
+
+ memset(&entry, 0, sizeof(entry));
+ entry.mask = 0; /* Enabled */
+ entry.trigger = 0; /* Edge */
+ entry.irr = 0;
+ entry.polarity = 0; /* High */
+ entry.delivery_status = 0;
+ entry.dest_mode = 0; /* Physical */
+ entry.delivery_mode = dest_ExtINT; /* ExtInt */
+ entry.vector = 0;
+ entry.dest = GET_APIC_ID(read_apic_id());
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
+ }
+
+ disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+#endif
+}
+
+/*
+ * There is a nasty bug in some older SMP boards, their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ * - timer IRQ defaults to IO-APIC IRQ
+ * - if this function detects that timer IRQs are defunct, then we fall
+ * back to ISA timer IRQs
+ */
+#ifndef CONFIG_XEN
+static int __init timer_irq_works(void)
+{
+ unsigned long t1 = jiffies;
+ unsigned long flags;
+
+ local_save_flags(flags);
+ local_irq_enable();
+ /* Let ten ticks pass... */
+ mdelay((10 * 1000) / HZ);
+ local_irq_restore(flags);
+
+ /*
+ * Expect a few ticks at least, to be sure some possible
+ * glue logic does not lock up after one or two first
+ * ticks in a non-ExtINT mode. Also the local APIC
+ * might have cached one ExtINT interrupt. Finally, at
+ * least one tick may be lost due to delays.
+ */
+
+ /* jiffies wrap? */
+ if (time_after(jiffies, t1 + 4))
+ return 1;
+ return 0;
+}
+
+/*
+ * In the SMP+IOAPIC case it might happen that there are an unspecified
+ * number of pending IRQ events unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
+ * better to do it this way as thus we do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge triggered needs to resend any interrupt
+ * that was delayed but this is now handled in the device
+ * independent code.
+ */
+
+/*
+ * Starting up a edge-triggered IO-APIC interrupt is
+ * nasty - we need to make sure that we get the edge.
+ * If it is already asserted for some reason, we need
+ * return 1 to indicate that is was pending.
+ *
+ * This is not complete - we should be able to fake
+ * an edge even if it isn't on the 8259A...
+ */
+
+static unsigned int startup_ioapic_irq(unsigned int irq)
+{
+ int was_pending = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ if (irq < 16) {
+ disable_8259A_irq(irq);
+ if (i8259A_irq_pending(irq))
+ was_pending = 1;
+ }
+ __unmask_IO_APIC_irq(irq);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return was_pending;
+}
+
+static int ioapic_retrigger_irq(unsigned int irq)
+{
+ struct irq_cfg *cfg = &irq_cfg[irq];
+ cpumask_t mask;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vector_lock, flags);
+ mask = cpumask_of_cpu(first_cpu(cfg->domain));
+ send_IPI_mask(mask, cfg->vector);
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+ return 1;
+}
+
+/*
+ * Level and edge triggered IO-APIC interrupts need different handling,
+ * so we use two separate IRQ descriptors. Edge triggered IRQs can be
+ * handled with the level-triggered descriptor, but that one has slightly
+ * more overhead. Level-triggered interrupts cannot be handled with the
+ * edge-triggered handler, without risking IRQ storms and other ugly
+ * races.
+ */
+
+#ifdef CONFIG_SMP
+asmlinkage void smp_irq_move_cleanup_interrupt(void)
+{
+ unsigned vector, me;
+ ack_APIC_irq();
+ exit_idle();
+ irq_enter();
+
+ me = smp_processor_id();
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+ unsigned int irq;
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ irq = __get_cpu_var(vector_irq)[vector];
+ if (irq >= NR_IRQS)
+ continue;
+
+ desc = irq_desc + irq;
+ cfg = irq_cfg + irq;
+ spin_lock(&desc->lock);
+ if (!cfg->move_cleanup_count)
+ goto unlock;
+
+ if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
+ goto unlock;
+
+ __get_cpu_var(vector_irq)[vector] = -1;
+ cfg->move_cleanup_count--;
+unlock:
+ spin_unlock(&desc->lock);
+ }
+
+ irq_exit();
+}
+
+static void irq_complete_move(unsigned int irq)
+{
+ struct irq_cfg *cfg = irq_cfg + irq;
+ unsigned vector, me;
+
+ if (likely(!cfg->move_in_progress))
+ return;
+
+ vector = ~get_irq_regs()->orig_ax;
+ me = smp_processor_id();
+ if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
+ cpumask_t cleanup_mask;
+
+ cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+ cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+ send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ cfg->move_in_progress = 0;
+ }
+}
+#else
+static inline void irq_complete_move(unsigned int irq) {}
+#endif
+
+static void ack_apic_edge(unsigned int irq)
+{
+ irq_complete_move(irq);
+ move_native_irq(irq);
+ ack_APIC_irq();
+}
+
+static void ack_apic_level(unsigned int irq)
+{
+ int do_unmask_irq = 0;
+
+ irq_complete_move(irq);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ /* If we are moving the irq we need to mask it */
+ if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
+ do_unmask_irq = 1;
+ mask_IO_APIC_irq(irq);
+ }
+#endif
+
+ /*
+ * We must acknowledge the irq before we move it or the acknowledge will
+ * not propagate properly.
+ */
+ ack_APIC_irq();
+
+ /* Now we can move and renable the irq */
+ if (unlikely(do_unmask_irq)) {
+ /* Only migrate the irq if the ack has been received.
+ *
+ * On rare occasions the broadcast level triggered ack gets
+ * delayed going to ioapics, and if we reprogram the
+ * vector while Remote IRR is still set the irq will never
+ * fire again.
+ *
+ * To prevent this scenario we read the Remote IRR bit
+ * of the ioapic. This has two effects.
+ * - On any sane system the read of the ioapic will
+ * flush writes (and acks) going to the ioapic from
+ * this cpu.
+ * - We get to see if the ACK has actually been delivered.
+ *
+ * Based on failed experiments of reprogramming the
+ * ioapic entry from outside of irq context starting
+ * with masking the ioapic entry and then polling until
+ * Remote IRR was clear before reprogramming the
+ * ioapic I don't trust the Remote IRR bit to be
+ * completey accurate.
+ *
+ * However there appears to be no other way to plug
+ * this race, so if the Remote IRR bit is not
+ * accurate and is causing problems then it is a hardware bug
+ * and you can go talk to the chipset vendor about it.
+ */
+ if (!io_apic_level_ack_pending(irq))
+ move_masked_irq(irq);
+ unmask_IO_APIC_irq(irq);
+ }
+}
+
+static struct irq_chip ioapic_chip __read_mostly = {
+ .name = "IO-APIC",
+ .startup = startup_ioapic_irq,
+ .mask = mask_IO_APIC_irq,
+ .unmask = unmask_IO_APIC_irq,
+ .ack = ack_apic_edge,
+ .eoi = ack_apic_level,
+#ifdef CONFIG_SMP
+ .set_affinity = set_ioapic_affinity_irq,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+#endif /* !CONFIG_XEN */
+
+static inline void init_IO_APIC_traps(void)
+{
+ int irq;
+
+ /*
+ * NOTE! The local APIC isn't very good at handling
+ * multiple interrupts at the same interrupt level.
+ * As the interrupt level is determined by taking the
+ * vector number and shifting that right by 4, we
+ * want to spread these out a bit so that they don't
+ * all fall in the same interrupt level.
+ *
+ * Also, we've got to be careful not to trash gate
+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
+ */
+ for (irq = 0; irq < NR_IRQS ; irq++) {
+ if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) {
+ /*
+ * Hmm.. We don't have an entry for this,
+ * so default to an old-fashioned 8259
+ * interrupt if we can..
+ */
+ if (irq < 16)
+ make_8259A_irq(irq);
+#ifndef CONFIG_XEN
+ else
+ /* Strange. Oh, well.. */
+ irq_desc[irq].chip = &no_irq_chip;
+#endif
+ }
+ }
+}
+
+#ifndef CONFIG_XEN
+static void enable_lapic_irq (unsigned int irq)
+{
+ unsigned long v;
+
+ v = apic_read(APIC_LVT0);
+ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
+}
+
+static void disable_lapic_irq (unsigned int irq)
+{
+ unsigned long v;
+
+ v = apic_read(APIC_LVT0);
+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
+}
+
+static void ack_lapic_irq (unsigned int irq)
+{
+ ack_APIC_irq();
+}
+
+static void end_lapic_irq (unsigned int i) { /* nothing */ }
+
+static struct hw_interrupt_type lapic_irq_type __read_mostly = {
+ .name = "local-APIC",
+ .typename = "local-APIC-edge",
+ .startup = NULL, /* startup_irq() not used for IRQ0 */
+ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
+ .enable = enable_lapic_irq,
+ .disable = disable_lapic_irq,
+ .ack = ack_lapic_irq,
+ .end = end_lapic_irq,
+};
+
+static void __init setup_nmi(void)
+{
+ /*
+ * Dirty trick to enable the NMI watchdog ...
+ * We put the 8259A master into AEOI mode and
+ * unmask on all local APICs LVT0 as NMI.
+ *
+ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
+ * is from Maciej W. Rozycki - so we do not have to EOI from
+ * the NMI handler or the timer interrupt.
+ */
+ printk(KERN_INFO "activating NMI Watchdog ...");
+
+ enable_NMI_through_LVT0();
+
+ printk(" done.\n");
+}
+
+/*
+ * This looks a bit hackish but it's about the only one way of sending
+ * a few INTA cycles to 8259As and any associated glue logic. ICR does
+ * not support the ExtINT mode, unfortunately. We need to send these
+ * cycles as some i82489DX-based boards have glue logic that keeps the
+ * 8259A interrupt line asserted until INTA. --macro
+ */
+static inline void __init unlock_ExtINT_logic(void)
+{
+ int apic, pin, i;
+ struct IO_APIC_route_entry entry0, entry1;
+ unsigned char save_control, save_freq_select;
+
+ pin = find_isa_irq_pin(8, mp_INT);
+ apic = find_isa_irq_apic(8, mp_INT);
+ if (pin == -1)
+ return;
+
+ entry0 = ioapic_read_entry(apic, pin);
+
+ clear_IO_APIC_pin(apic, pin);
+
+ memset(&entry1, 0, sizeof(entry1));
+
+ entry1.dest_mode = 0; /* physical delivery */
+ entry1.mask = 0; /* unmask IRQ now */
+ entry1.dest = hard_smp_processor_id();
+ entry1.delivery_mode = dest_ExtINT;
+ entry1.polarity = entry0.polarity;
+ entry1.trigger = 0;
+ entry1.vector = 0;
+
+ ioapic_write_entry(apic, pin, entry1);
+
+ save_control = CMOS_READ(RTC_CONTROL);
+ save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
+ RTC_FREQ_SELECT);
+ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
+
+ i = 100;
+ while (i-- > 0) {
+ mdelay(10);
+ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
+ i -= 10;
+ }
+
+ CMOS_WRITE(save_control, RTC_CONTROL);
+ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+ clear_IO_APIC_pin(apic, pin);
+
+ ioapic_write_entry(apic, pin, entry0);
+}
+
+/*
+ * This code may look a bit paranoid, but it's supposed to cooperate with
+ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
+ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
+ * fanatically on his truly buggy board.
+ *
+ * FIXME: really need to revamp this for modern platforms only.
+ */
+static inline void __init check_timer(void)
+{
+ struct irq_cfg *cfg = irq_cfg + 0;
+ int apic1, pin1, apic2, pin2;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ /*
+ * get/set the timer IRQ vector:
+ */
+ disable_8259A_irq(0);
+ assign_irq_vector(0, TARGET_CPUS);
+
+ /*
+ * Subtle, code in do_timer_interrupt() expects an AEOI
+ * mode for the 8259A whenever interrupts are routed
+ * through I/O APICs. Also IRQ0 has to be enabled in
+ * the 8259A which implies the virtual wire has to be
+ * disabled in the local APIC.
+ */
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ init_8259A(1);
+ if (timer_over_8254 > 0)
+ enable_8259A_irq(0);
+
+ pin1 = find_isa_irq_pin(0, mp_INT);
+ apic1 = find_isa_irq_apic(0, mp_INT);
+ pin2 = ioapic_i8259.pin;
+ apic2 = ioapic_i8259.apic;
+
+ apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ cfg->vector, apic1, pin1, apic2, pin2);
+
+ if (pin1 != -1) {
+ /*
+ * Ok, does IRQ0 through the IOAPIC work?
+ */
+ unmask_IO_APIC_irq(0);
+ if (!no_timer_check && timer_irq_works()) {
+ nmi_watchdog_default();
+ if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
+ setup_nmi();
+ enable_8259A_irq(0);
+ }
+ if (disable_timer_pin_1 > 0)
+ clear_IO_APIC_pin(0, pin1);
+ goto out;
+ }
+ clear_IO_APIC_pin(apic1, pin1);
+ apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
+ "connected to IO-APIC\n");
+ }
+
+ apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
+ "through the 8259A ... ");
+ if (pin2 != -1) {
+ apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
+ apic2, pin2);
+ /*
+ * legacy devices should be connected to IO APIC #0
+ */
+ setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
+ if (timer_irq_works()) {
+ apic_printk(APIC_VERBOSE," works.\n");
+ nmi_watchdog_default();
+ if (nmi_watchdog == NMI_IO_APIC) {
+ setup_nmi();
+ }
+ goto out;
+ }
+ /*
+ * Cleanup, just in case ...
+ */
+ clear_IO_APIC_pin(apic2, pin2);
+ }
+ apic_printk(APIC_VERBOSE," failed.\n");
+
+ if (nmi_watchdog == NMI_IO_APIC) {
+ printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+ nmi_watchdog = 0;
+ }
+
+ apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+
+ disable_8259A_irq(0);
+ irq_desc[0].chip = &lapic_irq_type;
+ apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
+ enable_8259A_irq(0);
+
+ if (timer_irq_works()) {
+ apic_printk(APIC_VERBOSE," works.\n");
+ goto out;
+ }
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
+ apic_printk(APIC_VERBOSE," failed.\n");
+
+ apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+
+ init_8259A(0);
+ make_8259A_irq(0);
+ apic_write(APIC_LVT0, APIC_DM_EXTINT);
+
+ unlock_ExtINT_logic();
+
+ if (timer_irq_works()) {
+ apic_printk(APIC_VERBOSE," works.\n");
+ goto out;
+ }
+ apic_printk(APIC_VERBOSE," failed :(.\n");
+ panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
+out:
+ local_irq_restore(flags);
+}
+
+static int __init notimercheck(char *s)
+{
+ no_timer_check = 1;
+ return 1;
+}
+__setup("no_timer_check", notimercheck);
+#else
+#define check_timer() ((void)0)
+int timer_uses_ioapic_pin_0 = 0;
+#endif /* !CONFIG_XEN */
+
+/*
+ *
+ * IRQs that are handled by the PIC in the MPS IOAPIC case.
+ * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
+ * Linux doesn't really care, as it's not actually used
+ * for any interrupt handling anyway.
+ */
+#define PIC_IRQS (1<<2)
+
+void __init setup_IO_APIC(void)
+{
+ enable_IO_APIC();
+
+ if (acpi_ioapic)
+ io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
+ else
+ io_apic_irqs = ~PIC_IRQS;
+
+ apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+
+#ifndef CONFIG_XEN
+ sync_Arb_IDs();
+#endif /* !CONFIG_XEN */
+ setup_IO_APIC_irqs();
+ init_IO_APIC_traps();
+ check_timer();
+ if (!acpi_ioapic)
+ print_IO_APIC();
+}
+
+struct sysfs_ioapic_data {
+ struct sys_device dev;
+ struct IO_APIC_route_entry entry[0];
+};
+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+
+static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
+{
+ struct IO_APIC_route_entry *entry;
+ struct sysfs_ioapic_data *data;
+ int i;
+
+ data = container_of(dev, struct sysfs_ioapic_data, dev);
+ entry = data->entry;
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
+ *entry = ioapic_read_entry(dev->id, i);
+
+ return 0;
+}
+
+static int ioapic_resume(struct sys_device *dev)
+{
+ struct IO_APIC_route_entry *entry;
+ struct sysfs_ioapic_data *data;
+ unsigned long flags;
+ union IO_APIC_reg_00 reg_00;
+ int i;
+
+ data = container_of(dev, struct sysfs_ioapic_data, dev);
+ entry = data->entry;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(dev->id, 0);
+ if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+ io_apic_write(dev->id, 0, reg_00.raw);
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
+ ioapic_write_entry(dev->id, i, entry[i]);
+
+ return 0;
+}
+
+static struct sysdev_class ioapic_sysdev_class = {
+ .name = "ioapic",
+ .suspend = ioapic_suspend,
+ .resume = ioapic_resume,
+};
+
+static int __init ioapic_init_sysfs(void)
+{
+ struct sys_device * dev;
+ int i, size, error;
+
+ error = sysdev_class_register(&ioapic_sysdev_class);
+ if (error)
+ return error;
+
+ for (i = 0; i < nr_ioapics; i++ ) {
+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
+ * sizeof(struct IO_APIC_route_entry);
+ mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
+ if (!mp_ioapic_data[i]) {
+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+ continue;
+ }
+ dev = &mp_ioapic_data[i]->dev;
+ dev->id = i;
+ dev->cls = &ioapic_sysdev_class;
+ error = sysdev_register(dev);
+ if (error) {
+ kfree(mp_ioapic_data[i]);
+ mp_ioapic_data[i] = NULL;
+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+ continue;
+ }
+ }
+
+ return 0;
+}
+
+device_initcall(ioapic_init_sysfs);
+
+#ifndef CONFIG_XEN
+/*
+ * Dynamic irq allocate and deallocation
+ */
+int create_irq(void)
+{
+ /* Allocate an unused irq */
+ int irq;
+ int new;
+ unsigned long flags;
+
+ irq = -ENOSPC;
+ spin_lock_irqsave(&vector_lock, flags);
+ for (new = (NR_IRQS - 1); new >= 0; new--) {
+ if (platform_legacy_irq(new))
+ continue;
+ if (irq_cfg[new].vector != 0)
+ continue;
+ if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+ irq = new;
+ break;
+ }
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+ if (irq >= 0) {
+ dynamic_irq_init(irq);
+ }
+ return irq;
+}
+
+void destroy_irq(unsigned int irq)
+{
+ unsigned long flags;
+
+ dynamic_irq_cleanup(irq);
+
+ spin_lock_irqsave(&vector_lock, flags);
+ __clear_irq_vector(irq);
+ spin_unlock_irqrestore(&vector_lock, flags);
+}
+#endif
+
+/*
+ * MSI message composition
+ */
+#ifdef CONFIG_PCI_MSI
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+{
+ struct irq_cfg *cfg = irq_cfg + irq;
+ int err;
+ unsigned dest;
+ cpumask_t tmp;
+
+ tmp = TARGET_CPUS;
+ err = assign_irq_vector(irq, tmp);
+ if (!err) {
+ cpus_and(tmp, cfg->domain, tmp);
+ dest = cpu_mask_to_apicid(tmp);
+
+ msg->address_hi = MSI_ADDR_BASE_HI;
+ msg->address_lo =
+ MSI_ADDR_BASE_LO |
+ ((INT_DEST_MODE == 0) ?
+ MSI_ADDR_DEST_MODE_PHYSICAL:
+ MSI_ADDR_DEST_MODE_LOGICAL) |
+ ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ MSI_ADDR_REDIRECTION_CPU:
+ MSI_ADDR_REDIRECTION_LOWPRI) |
+ MSI_ADDR_DEST_ID(dest);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ MSI_DATA_DELIVERY_FIXED:
+ MSI_DATA_DELIVERY_LOWPRI) |
+ MSI_DATA_VECTOR(cfg->vector);
+ }
+ return err;
+}
+
+#ifdef CONFIG_SMP
+static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+ struct irq_cfg *cfg = irq_cfg + irq;
+ struct msi_msg msg;
+ unsigned int dest;
+ cpumask_t tmp;
+
+ cpus_and(tmp, mask, cpu_online_map);
+ if (cpus_empty(tmp))
+ return;
+
+ if (assign_irq_vector(irq, mask))
+ return;
+
+ cpus_and(tmp, cfg->domain, mask);
+ dest = cpu_mask_to_apicid(tmp);
+
+ read_msi_msg(irq, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+ write_msi_msg(irq, &msg);
+ irq_desc[irq].affinity = mask;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip msi_chip = {
+ .name = "PCI-MSI",
+ .unmask = unmask_msi_irq,
+ .mask = mask_msi_irq,
+ .ack = ack_apic_edge,
+#ifdef CONFIG_SMP
+ .set_affinity = set_msi_irq_affinity,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+{
+ struct msi_msg msg;
+ int irq, ret;
+ irq = create_irq();
+ if (irq < 0)
+ return irq;
+
+ ret = msi_compose_msg(dev, irq, &msg);
+ if (ret < 0) {
+ destroy_irq(irq);
+ return ret;
+ }
+
+ set_irq_msi(irq, desc);
+ write_msi_msg(irq, &msg);
+
+ set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+
+ return 0;
+}
+
+void arch_teardown_msi_irq(unsigned int irq)
+{
+ destroy_irq(irq);
+}
+
+#ifdef CONFIG_DMAR
+#ifdef CONFIG_SMP
+static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+{
+ struct irq_cfg *cfg = irq_cfg + irq;
+ struct msi_msg msg;
+ unsigned int dest;
+ cpumask_t tmp;
+
+ cpus_and(tmp, mask, cpu_online_map);
+ if (cpus_empty(tmp))
+ return;
+
+ if (assign_irq_vector(irq, mask))
+ return;
+
+ cpus_and(tmp, cfg->domain, mask);
+ dest = cpu_mask_to_apicid(tmp);
+
+ dmar_msi_read(irq, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+ dmar_msi_write(irq, &msg);
+ irq_desc[irq].affinity = mask;
+}
+#endif /* CONFIG_SMP */
+
+struct irq_chip dmar_msi_type = {
+ .name = "DMAR_MSI",
+ .unmask = dmar_msi_unmask,
+ .mask = dmar_msi_mask,
+ .ack = ack_apic_edge,
+#ifdef CONFIG_SMP
+ .set_affinity = dmar_msi_set_affinity,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_dmar_msi(unsigned int irq)
+{
+ int ret;
+ struct msi_msg msg;
+
+ ret = msi_compose_msg(NULL, irq, &msg);
+ if (ret < 0)
+ return ret;
+ dmar_msi_write(irq, &msg);
+ set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
+ "edge");
+ return 0;
+}
+#endif
+
+#endif /* CONFIG_PCI_MSI */
+/*
+ * Hypertransport interrupt support
+ */
+#ifdef CONFIG_HT_IRQ
+
+#ifdef CONFIG_SMP
+
+static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
+{
+ struct ht_irq_msg msg;
+ fetch_ht_irq_msg(irq, &msg);
+
+ msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
+ msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+
+ msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
+ msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
+
+ write_ht_irq_msg(irq, &msg);
+}
+
+static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+ struct irq_cfg *cfg = irq_cfg + irq;
+ unsigned int dest;
+ cpumask_t tmp;
+
+ cpus_and(tmp, mask, cpu_online_map);
+ if (cpus_empty(tmp))
+ return;
+
+ if (assign_irq_vector(irq, mask))
+ return;
+
+ cpus_and(tmp, cfg->domain, mask);
+ dest = cpu_mask_to_apicid(tmp);
+
+ target_ht_irq(irq, dest, cfg->vector);
+ irq_desc[irq].affinity = mask;
+}
+#endif
+
+static struct irq_chip ht_irq_chip = {
+ .name = "PCI-HT",
+ .mask = mask_ht_irq,
+ .unmask = unmask_ht_irq,
+ .ack = ack_apic_edge,
+#ifdef CONFIG_SMP
+ .set_affinity = set_ht_irq_affinity,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+{
+ struct irq_cfg *cfg = irq_cfg + irq;
+ int err;
+ cpumask_t tmp;
+
+ tmp = TARGET_CPUS;
+ err = assign_irq_vector(irq, tmp);
+ if (!err) {
+ struct ht_irq_msg msg;
+ unsigned dest;
+
+ cpus_and(tmp, cfg->domain, tmp);
+ dest = cpu_mask_to_apicid(tmp);
+
+ msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+
+ msg.address_lo =
+ HT_IRQ_LOW_BASE |
+ HT_IRQ_LOW_DEST_ID(dest) |
+ HT_IRQ_LOW_VECTOR(cfg->vector) |
+ ((INT_DEST_MODE == 0) ?
+ HT_IRQ_LOW_DM_PHYSICAL :
+ HT_IRQ_LOW_DM_LOGICAL) |
+ HT_IRQ_LOW_RQEOI_EDGE |
+ ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ HT_IRQ_LOW_MT_FIXED :
+ HT_IRQ_LOW_MT_ARBITRATED) |
+ HT_IRQ_LOW_IRQ_MASKED;
+
+ write_ht_irq_msg(irq, &msg);
+
+ set_irq_chip_and_handler_name(irq, &ht_irq_chip,
+ handle_edge_irq, "edge");
+ }
+ return err;
+}
+#endif /* CONFIG_HT_IRQ */
+
+/* --------------------------------------------------------------------------
+ ACPI-based IOAPIC Configuration
+ -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI
+
+#define IO_APIC_MAX_ID 0xFE
+
+int __init io_apic_get_redir_entries (int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return reg_01.bits.entries;
+}
+
+
+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
+{
+ if (!IO_APIC_IRQ(irq)) {
+ apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+ ioapic);
+ return -EINVAL;
+ }
+
+ /*
+ * IRQs < 16 are already in the irq_2_pin[] map
+ */
+ if (irq >= 16)
+ add_pin_to_irq(irq, ioapic, pin);
+
+ setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
+
+ return 0;
+}
+
+
+int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
+{
+ int i;
+
+ if (skip_ioapic_setup)
+ return -1;
+
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].mpc_irqtype == mp_INT &&
+ mp_irqs[i].mpc_srcbusirq == bus_irq)
+ break;
+ if (i >= mp_irq_entries)
+ return -1;
+
+ *trigger = irq_trigger(i);
+ *polarity = irq_polarity(i);
+ return 0;
+}
+
+#endif /* CONFIG_ACPI */
+
+#ifndef CONFIG_XEN
+/*
+ * This function currently is only a helper for the i386 smp boot process where
+ * we need to reprogram the ioredtbls to cater for the cpus which have come online
+ * so mask in all cases should simply be TARGET_CPUS
+ */
+#ifdef CONFIG_SMP
+void __init setup_ioapic_dest(void)
+{
+ int pin, ioapic, irq, irq_entry;
+
+ if (skip_ioapic_setup == 1)
+ return;
+
+ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
+ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+ irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+ if (irq_entry == -1)
+ continue;
+ irq = pin_2_irq(irq_entry, ioapic, pin);
+
+ /* setup_IO_APIC_irqs could fail to get vector for some device
+ * when you have too many devices, because at that time only boot
+ * cpu is online.
+ */
+ if (!irq_cfg[irq].vector)
+ setup_IO_APIC_irq(ioapic, pin, irq,
+ irq_trigger(irq_entry),
+ irq_polarity(irq_entry));
+ else
+ set_ioapic_affinity_irq(irq, TARGET_CPUS);
+ }
+
+ }
+}
+#endif
+
+#define IOAPIC_RESOURCE_NAME_SIZE 11
+
+static struct resource *ioapic_resources;
+
+static struct resource * __init ioapic_setup_resources(void)
+{
+ unsigned long n;
+ struct resource *res;
+ char *mem;
+ int i;
+
+ if (nr_ioapics <= 0)
+ return NULL;
+
+ n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
+ n *= nr_ioapics;
+
+ mem = alloc_bootmem(n);
+ res = (void *)mem;
+
+ if (mem != NULL) {
+ mem += sizeof(struct resource) * nr_ioapics;
+
+ for (i = 0; i < nr_ioapics; i++) {
+ res[i].name = mem;
+ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ sprintf(mem, "IOAPIC %u", i);
+ mem += IOAPIC_RESOURCE_NAME_SIZE;
+ }
+ }
+
+ ioapic_resources = res;
+
+ return res;
+}
+
+void __init ioapic_init_mappings(void)
+{
+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+ struct resource *ioapic_res;
+ int i;
+
+ ioapic_res = ioapic_setup_resources();
+ for (i = 0; i < nr_ioapics; i++) {
+ if (smp_found_config) {
+ ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+ } else {
+ ioapic_phys = (unsigned long)
+ alloc_bootmem_pages(PAGE_SIZE);
+ ioapic_phys = __pa(ioapic_phys);
+ }
+ set_fixmap_nocache(idx, ioapic_phys);
+ apic_printk(APIC_VERBOSE,
+ "mapped IOAPIC to %016lx (%016lx)\n",
+ __fix_to_virt(idx), ioapic_phys);
+ idx++;
+
+ if (ioapic_res != NULL) {
+ ioapic_res->start = ioapic_phys;
+ ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
+ ioapic_res++;
+ }
+ }
+}
+
+static int __init ioapic_insert_resources(void)
+{
+ int i;
+ struct resource *r = ioapic_resources;
+
+ if (!r) {
+ printk(KERN_ERR
+ "IO APIC resources could be not be allocated.\n");
+ return -1;
+ }
+
+ for (i = 0; i < nr_ioapics; i++) {
+ insert_resource(&iomem_resource, r);
+ r++;
+ }
+
+ return 0;
+}
+
+/* Insert the IO APIC resources after PCI initialization has occured to handle
+ * IO APICS that are mapped in on a BAR in PCI space. */
+late_initcall(ioapic_insert_resources);
+#endif /* !CONFIG_XEN */
--- /dev/null
+/*
+ * This contains the io-permission bitmap code - written by obz, with changes
+ * by Linus. 32/64 bits code unification by Miguel Botón.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/ioport.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/syscalls.h>
+#include <xen/interface/physdev.h>
+
+/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
+static void set_bitmap(unsigned long *bitmap, unsigned int base,
+ unsigned int extent, int new_value)
+{
+ unsigned int i;
+
+ for (i = base; i < base + extent; i++) {
+ if (new_value)
+ __set_bit(i, bitmap);
+ else
+ __clear_bit(i, bitmap);
+ }
+}
+
+/*
+ * this changes the io permissions bitmap in the current task.
+ */
+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+ struct thread_struct * t = ¤t->thread;
+ struct physdev_set_iobitmap set_iobitmap;
+
+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+ return -EINVAL;
+ if (turn_on && !capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ /*
+ * If it's the first ioperm() call in this thread's lifetime, set the
+ * IO bitmap up. ioperm() is much less timing critical than clone(),
+ * this is why we delay this operation until now:
+ */
+ if (!t->io_bitmap_ptr) {
+ unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+
+ if (!bitmap)
+ return -ENOMEM;
+
+ memset(bitmap, 0xff, IO_BITMAP_BYTES);
+ t->io_bitmap_ptr = bitmap;
+ set_thread_flag(TIF_IO_BITMAP);
+
+ set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
+ set_iobitmap.nr_ports = IO_BITMAP_BITS;
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
+ &set_iobitmap));
+ }
+
+ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+
+ return 0;
+}
+
+/*
+ * sys_iopl has to be used when you want to access the IO ports
+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ */
+static int do_iopl(unsigned int level, struct thread_struct *t)
+{
+ unsigned int old = t->iopl >> 12;
+
+ if (level > 3)
+ return -EINVAL;
+ /* Trying to gain more privileges? */
+ if (level > old) {
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_X86_32
+asmlinkage long sys_iopl(unsigned long regsp)
+{
+ struct pt_regs *regs = (struct pt_regs *)®sp;
+ unsigned int level = regs->bx;
+#else
+asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
+{
+#endif
+ struct thread_struct *t = ¤t->thread;
+ int rc;
+
+ rc = do_iopl(level, t);
+ if (rc < 0)
+ goto out;
+
+ t->iopl = level << 12;
+ set_iopl_mask(t->iopl);
+out:
+ return rc;
+}
--- /dev/null
+#include <linux/cpumask.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/apic.h>
+#include <asm/proto.h>
+
+#ifdef CONFIG_X86_32
+#ifndef CONFIG_XEN
+#include <mach_apic.h>
+/*
+ * the following functions deal with sending IPIs between CPUs.
+ *
+ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
+ */
+
+static inline int __prepare_ICR(unsigned int shortcut, int vector)
+{
+ unsigned int icr = shortcut | APIC_DEST_LOGICAL;
+
+ switch (vector) {
+ default:
+ icr |= APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr |= APIC_DM_NMI;
+ break;
+ }
+ return icr;
+}
+
+static inline int __prepare_ICR2(unsigned int mask)
+{
+ return SET_APIC_DEST_FIELD(mask);
+}
+#else
+#include <xen/evtchn.h>
+
+DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
+
+static inline void __send_IPI_one(unsigned int cpu, int vector)
+{
+ int irq = per_cpu(ipi_to_irq, cpu)[vector];
+ BUG_ON(irq < 0);
+ notify_remote_via_irq(irq);
+}
+#endif
+
+void __send_IPI_shortcut(unsigned int shortcut, int vector)
+{
+#ifndef CONFIG_XEN
+ /*
+ * Subtle. In the case of the 'never do double writes' workaround
+ * we have to lock out interrupts to be safe. As we don't care
+ * of the value read we use an atomic rmw access to avoid costly
+ * cli/sti. Otherwise we use an even cheaper single atomic write
+ * to the APIC.
+ */
+ unsigned int cfg;
+
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+
+ /*
+ * No need to touch the target chip field
+ */
+ cfg = __prepare_ICR(shortcut, vector);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ apic_write_around(APIC_ICR, cfg);
+#else
+ int cpu;
+
+ switch (shortcut) {
+ case APIC_DEST_SELF:
+ __send_IPI_one(smp_processor_id(), vector);
+ break;
+ case APIC_DEST_ALLBUT:
+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ if (cpu == smp_processor_id())
+ continue;
+ if (cpu_isset(cpu, cpu_online_map)) {
+ __send_IPI_one(cpu, vector);
+ }
+ }
+ break;
+ default:
+ printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
+ vector);
+ break;
+ }
+#endif
+}
+
+void send_IPI_self(int vector)
+{
+ __send_IPI_shortcut(APIC_DEST_SELF, vector);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+static inline void __send_IPI_dest_field(unsigned long mask, int vector)
+{
+ unsigned long cfg;
+
+ /*
+ * Wait for idle.
+ */
+ if (unlikely(vector == NMI_VECTOR))
+ safe_apic_wait_icr_idle();
+ else
+ apic_wait_icr_idle();
+
+ /*
+ * prepare target chip field
+ */
+ cfg = __prepare_ICR2(mask);
+ apic_write_around(APIC_ICR2, cfg);
+
+ /*
+ * program the ICR
+ */
+ cfg = __prepare_ICR(0, vector);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ apic_write_around(APIC_ICR, cfg);
+}
+#endif
+
+/*
+ * This is only used on smaller machines.
+ */
+void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+{
+ unsigned long mask = cpus_addr(cpumask)[0];
+ unsigned long flags;
+#ifdef CONFIG_XEN
+ unsigned int cpu;
+#endif
+
+ local_irq_save(flags);
+ WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
+#ifndef CONFIG_XEN
+ __send_IPI_dest_field(mask, vector);
+#else
+ for (cpu = 0; cpu < NR_CPUS; ++cpu)
+ if (cpu_isset(cpu, cpumask))
+ __send_IPI_one(cpu, vector);
+#endif
+ local_irq_restore(flags);
+}
+
+void send_IPI_mask_sequence(cpumask_t mask, int vector)
+{
+#ifndef CONFIG_XEN
+ unsigned long flags;
+ unsigned int query_cpu;
+
+ /*
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+ * to an arbitrary mask, so I do a unicasts to each CPU instead. This
+ * should be modified to do 1 message per cluster ID - mbligh
+ */
+
+ local_irq_save(flags);
+ for_each_possible_cpu(query_cpu) {
+ if (cpu_isset(query_cpu, mask)) {
+ __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
+ vector);
+ }
+ }
+ local_irq_restore(flags);
+#else
+ send_IPI_mask_bitmask(mask, vector);
+#endif
+}
+
+/* must come after the send_IPI functions above for inlining */
+#include <mach_ipi.h>
+
+#ifndef CONFIG_XEN
+static int convert_apicid_to_cpu(int apic_id)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
+ return i;
+ }
+ return -1;
+}
+
+int safe_smp_processor_id(void)
+{
+ int apicid, cpuid;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC))
+ return 0;
+
+ apicid = hard_smp_processor_id();
+ if (apicid == BAD_APICID)
+ return 0;
+
+ cpuid = convert_apicid_to_cpu(apicid);
+
+ return cpuid >= 0 ? cpuid : 0;
+}
+#endif
+#endif
--- /dev/null
+/*
+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the lowest level x86-specific interrupt
+ * entry, irq-stacks and irq statistics code. All the remaining
+ * irq logic is done by the generic kernel/irq/ code and
+ * by the x86-specific irq controller code. (e.g. i8259.c and
+ * io_apic.c.)
+ */
+
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+
+#include <asm/apic.h>
+#include <asm/uaccess.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+ printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
+ /*
+ * Currently unexpected vectors happen only on SMP and APIC.
+ * We _must_ ack these because every local APIC has only N
+ * irq slots per priority level, and a 'hanging, unacked' IRQ
+ * holds up an irq slot - in excessive cases (when multiple
+ * unexpected vectors occur) that might lock up the APIC
+ * completely.
+ * But only ack when the APIC is enabled -AK
+ */
+ if (cpu_has_apic)
+ ack_APIC_irq();
+#endif
+}
+
+#ifdef CONFIG_4KSTACKS
+/*
+ * per-CPU IRQ handling contexts (thread information and stack)
+ */
+union irq_ctx {
+ struct thread_info tinfo;
+ u32 stack[THREAD_SIZE/sizeof(u32)];
+};
+
+static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
+static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
+#endif
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+unsigned int do_IRQ(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs;
+ /* high bit used in ret_from_ code */
+ int irq = ~regs->orig_ax;
+ struct irq_desc *desc = irq_desc + irq;
+#ifdef CONFIG_4KSTACKS
+ union irq_ctx *curctx, *irqctx;
+ u32 *isp;
+#endif
+
+ if (unlikely((unsigned)irq >= NR_IRQS)) {
+ printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
+ __func__, irq);
+ BUG();
+ }
+
+ old_regs = set_irq_regs(regs);
+ irq_enter();
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+ /* Debugging check for stack overflow: is there less than 1KB free? */
+ {
+ long sp;
+
+ __asm__ __volatile__("andl %%esp,%0" :
+ "=r" (sp) : "0" (THREAD_SIZE - 1));
+ if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
+ printk("do_IRQ: stack overflow: %ld\n",
+ sp - sizeof(struct thread_info));
+ dump_stack();
+ }
+ }
+#endif
+
+#ifdef CONFIG_4KSTACKS
+
+ curctx = (union irq_ctx *) current_thread_info();
+ irqctx = hardirq_ctx[smp_processor_id()];
+
+ /*
+ * this is where we switch to the IRQ stack. However, if we are
+ * already using the IRQ stack (because we interrupted a hardirq
+ * handler) we can't do that and just have to keep using the
+ * current stack (which is the irq stack already after all)
+ */
+ if (curctx != irqctx) {
+ int arg1, arg2, bx;
+
+ /* build the stack frame on the IRQ stack */
+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+ irqctx->tinfo.task = curctx->tinfo.task;
+ irqctx->tinfo.previous_esp = current_stack_pointer;
+
+ /*
+ * Copy the softirq bits in preempt_count so that the
+ * softirq checks work in the hardirq context.
+ */
+ irqctx->tinfo.preempt_count =
+ (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
+ (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+
+ asm volatile(
+ " xchgl %%ebx,%%esp \n"
+ " call *%%edi \n"
+ " movl %%ebx,%%esp \n"
+ : "=a" (arg1), "=d" (arg2), "=b" (bx)
+ : "0" (irq), "1" (desc), "2" (isp),
+ "D" (desc->handle_irq)
+ : "memory", "cc", "ecx"
+ );
+ } else
+#endif
+ desc->handle_irq(irq, desc);
+
+ irq_exit();
+ set_irq_regs(old_regs);
+ return 1;
+}
+
+#ifdef CONFIG_4KSTACKS
+
+static char softirq_stack[NR_CPUS * THREAD_SIZE]
+ __attribute__((__section__(".bss.page_aligned")));
+
+static char hardirq_stack[NR_CPUS * THREAD_SIZE]
+ __attribute__((__section__(".bss.page_aligned")));
+
+/*
+ * allocate per-cpu stacks for hardirq and for softirq processing
+ */
+void irq_ctx_init(int cpu)
+{
+ union irq_ctx *irqctx;
+
+ if (hardirq_ctx[cpu])
+ return;
+
+ irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+ irqctx->tinfo.task = NULL;
+ irqctx->tinfo.exec_domain = NULL;
+ irqctx->tinfo.cpu = cpu;
+ irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
+
+ hardirq_ctx[cpu] = irqctx;
+
+ irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
+ irqctx->tinfo.task = NULL;
+ irqctx->tinfo.exec_domain = NULL;
+ irqctx->tinfo.cpu = cpu;
+ irqctx->tinfo.preempt_count = 0;
+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
+
+ softirq_ctx[cpu] = irqctx;
+
+ printk("CPU %u irqstacks, hard=%p soft=%p\n",
+ cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
+}
+
+void irq_ctx_exit(int cpu)
+{
+ hardirq_ctx[cpu] = NULL;
+}
+
+asmlinkage void do_softirq(void)
+{
+ unsigned long flags;
+ struct thread_info *curctx;
+ union irq_ctx *irqctx;
+ u32 *isp;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+
+ if (local_softirq_pending()) {
+ curctx = current_thread_info();
+ irqctx = softirq_ctx[smp_processor_id()];
+ irqctx->tinfo.task = curctx->task;
+ irqctx->tinfo.previous_esp = current_stack_pointer;
+
+ /* build the stack frame on the softirq stack */
+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+
+ asm volatile(
+ " xchgl %%ebx,%%esp \n"
+ " call __do_softirq \n"
+ " movl %%ebx,%%esp \n"
+ : "=b"(isp)
+ : "0"(isp)
+ : "memory", "cc", "edx", "ecx", "eax"
+ );
+ /*
+ * Shouldnt happen, we returned above if in_interrupt():
+ */
+ WARN_ON_ONCE(softirq_count());
+ }
+
+ local_irq_restore(flags);
+}
+#endif
+
+/*
+ * Interrupt statistics:
+ */
+
+#ifndef CONFIG_XEN
+atomic_t irq_err_count;
+#endif
+
+/*
+ * /proc/interrupts printing:
+ */
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+ int i = *(loff_t *) v, j;
+ struct irqaction * action;
+ unsigned long flags;
+
+ if (i == 0) {
+ seq_printf(p, " ");
+ for_each_online_cpu(j)
+ seq_printf(p, "CPU%-8d",j);
+ seq_putc(p, '\n');
+ }
+
+ if (i < NR_IRQS) {
+ unsigned any_count = 0;
+
+ spin_lock_irqsave(&irq_desc[i].lock, flags);
+#ifndef CONFIG_SMP
+ any_count = kstat_irqs(i);
+#else
+ for_each_online_cpu(j)
+ any_count |= kstat_cpu(j).irqs[i];
+#endif
+ action = irq_desc[i].action;
+ if (!action && !any_count)
+ goto skip;
+ seq_printf(p, "%3d: ",i);
+#ifndef CONFIG_SMP
+ seq_printf(p, "%10u ", kstat_irqs(i));
+#else
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+#endif
+ seq_printf(p, " %8s", irq_desc[i].chip->name);
+ seq_printf(p, "-%-8s", irq_desc[i].name);
+
+ if (action) {
+ seq_printf(p, " %s", action->name);
+ while ((action = action->next) != NULL)
+ seq_printf(p, ", %s", action->name);
+ }
+
+ seq_putc(p, '\n');
+skip:
+ spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+ } else if (i == NR_IRQS) {
+ seq_printf(p, "NMI: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", nmi_count(j));
+ seq_printf(p, " Non-maskable interrupts\n");
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
+ seq_printf(p, "LOC: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ per_cpu(irq_stat,j).apic_timer_irqs);
+ seq_printf(p, " Local timer interrupts\n");
+#endif
+#ifdef CONFIG_SMP
+ seq_printf(p, "RES: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ per_cpu(irq_stat,j).irq_resched_count);
+ seq_printf(p, " Rescheduling interrupts\n");
+ seq_printf(p, "CAL: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ per_cpu(irq_stat,j).irq_call_count);
+ seq_printf(p, " function call interrupts\n");
+#ifndef CONFIG_XEN
+ seq_printf(p, "TLB: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ per_cpu(irq_stat,j).irq_tlb_count);
+ seq_printf(p, " TLB shootdowns\n");
+#endif
+#endif
+#ifdef CONFIG_X86_MCE
+ seq_printf(p, "TRM: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ per_cpu(irq_stat,j).irq_thermal_count);
+ seq_printf(p, " Thermal event interrupts\n");
+#endif
+#ifndef CONFIG_XEN
+#ifdef CONFIG_X86_LOCAL_APIC
+ seq_printf(p, "SPU: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ per_cpu(irq_stat,j).irq_spurious_count);
+ seq_printf(p, " Spurious interrupts\n");
+#endif
+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+#if defined(CONFIG_X86_IO_APIC)
+ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+#endif
+#endif
+ }
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+void fixup_irqs(cpumask_t map)
+{
+ unsigned int irq;
+ static int warned;
+
+ for (irq = 0; irq < NR_IRQS; irq++) {
+ cpumask_t mask;
+ if (irq == 2)
+ continue;
+
+ cpus_and(mask, irq_desc[irq].affinity, map);
+ if (any_online_cpu(mask) == NR_CPUS) {
+ /*printk("Breaking affinity for irq %i\n", irq);*/
+ mask = map;
+ }
+ if (irq_desc[irq].chip->set_affinity)
+ irq_desc[irq].chip->set_affinity(irq, mask);
+ else if (irq_desc[irq].action && !(warned++))
+ printk("Cannot set affinity for irq %i\n", irq);
+ }
+
+#if 0
+ barrier();
+ /* Ingo Molnar says: "after the IO-APIC masks have been redirected
+ [note the nop - the interrupt-enable boundary on x86 is two
+ instructions from sti] - to flush out pending hardirqs and
+ IPIs. After this point nothing is supposed to reach this CPU." */
+ __asm__ __volatile__("sti; nop; cli");
+ barrier();
+#else
+ /* That doesn't seem sufficient. Give it 1ms. */
+ local_irq_enable();
+ mdelay(1);
+ local_irq_disable();
+#endif
+}
+#endif
+
--- /dev/null
+/*
+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the lowest level x86_64-specific interrupt
+ * entry and irq statistics code. All the remaining irq logic is
+ * done by the generic kernel/irq/ code and in the
+ * x86_64-specific irq controller code. (e.g. i8259.c and
+ * io_apic.c.)
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <asm/uaccess.h>
+#include <asm/io_apic.h>
+#include <asm/idle.h>
+#include <asm/smp.h>
+
+#ifndef CONFIG_XEN
+atomic_t irq_err_count;
+#endif
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+ printk(KERN_WARNING "unexpected IRQ trap at irq %02x\n", irq);
+}
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+/*
+ * Probabilistic stack overflow check:
+ *
+ * Only check the stack in process context, because everything else
+ * runs on the big interrupt stacks. Checking reliably is too expensive,
+ * so we just check from interrupts.
+ */
+static inline void stack_overflow_check(struct pt_regs *regs)
+{
+ u64 curbase = (u64)task_stack_page(current);
+ static unsigned long warned = -60*HZ;
+
+ if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
+ regs->sp < curbase + sizeof(struct thread_info) + 128 &&
+ time_after(jiffies, warned + 60*HZ)) {
+ printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
+ current->comm, curbase, regs->sp);
+ show_stack(NULL,NULL);
+ warned = jiffies;
+ }
+}
+#endif
+
+/*
+ * Generic, controller-independent functions:
+ */
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+ int i = *(loff_t *) v, j;
+ struct irqaction * action;
+ unsigned long flags;
+
+ if (i == 0) {
+ seq_printf(p, " ");
+ for_each_online_cpu(j)
+ seq_printf(p, "CPU%-8d",j);
+ seq_putc(p, '\n');
+ }
+
+ if (i < NR_IRQS) {
+ unsigned any_count = 0;
+
+ spin_lock_irqsave(&irq_desc[i].lock, flags);
+#ifndef CONFIG_SMP
+ any_count = kstat_irqs(i);
+#else
+ for_each_online_cpu(j)
+ any_count |= kstat_cpu(j).irqs[i];
+#endif
+ action = irq_desc[i].action;
+ if (!action && !any_count)
+ goto skip;
+ seq_printf(p, "%3d: ",i);
+#ifndef CONFIG_SMP
+ seq_printf(p, "%10u ", kstat_irqs(i));
+#else
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+#endif
+ seq_printf(p, " %8s", irq_desc[i].chip->name);
+ seq_printf(p, "-%-8s", irq_desc[i].name);
+
+ if (action) {
+ seq_printf(p, " %s", action->name);
+ while ((action = action->next) != NULL)
+ seq_printf(p, ", %s", action->name);
+ }
+ seq_putc(p, '\n');
+skip:
+ spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+ } else if (i == NR_IRQS) {
+ seq_printf(p, "NMI: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
+ seq_printf(p, " Non-maskable interrupts\n");
+#ifndef CONFIG_XEN
+ seq_printf(p, "LOC: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
+ seq_printf(p, " Local timer interrupts\n");
+#endif
+#ifdef CONFIG_SMP
+ seq_printf(p, "RES: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count);
+ seq_printf(p, " Rescheduling interrupts\n");
+ seq_printf(p, "CAL: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
+ seq_printf(p, " function call interrupts\n");
+#ifndef CONFIG_XEN
+ seq_printf(p, "TLB: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
+ seq_printf(p, " TLB shootdowns\n");
+#endif
+#endif
+#ifdef CONFIG_X86_MCE
+ seq_printf(p, "TRM: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
+ seq_printf(p, " Thermal event interrupts\n");
+ seq_printf(p, "THR: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
+ seq_printf(p, " Threshold APIC interrupts\n");
+#endif
+#ifndef CONFIG_XEN
+ seq_printf(p, "SPU: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
+ seq_printf(p, " Spurious interrupts\n");
+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+#endif
+ }
+ return 0;
+}
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /* high bit used in ret_from_ code */
+ unsigned irq = ~regs->orig_ax;
+
+ exit_idle();
+ irq_enter();
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+ stack_overflow_check(regs);
+#endif
+
+ if (likely(irq < NR_IRQS))
+ generic_handle_irq(irq);
+ else {
+#ifndef CONFIG_XEN
+ if (!disable_apic)
+ ack_APIC_irq();
+#endif
+ if (printk_ratelimit())
+ printk(KERN_EMERG "%s: %d.%d No irq handler for irq\n",
+ __func__, smp_processor_id(), irq);
+ }
+
+ irq_exit();
+
+ set_irq_regs(old_regs);
+ return 1;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+void fixup_irqs(cpumask_t map)
+{
+ unsigned int irq;
+ static int warned;
+
+ for (irq = 0; irq < NR_IRQS; irq++) {
+ cpumask_t mask;
+ int break_affinity = 0;
+ int set_affinity = 1;
+
+ if (irq == 2)
+ continue;
+
+ /* interrupt's are disabled at this point */
+ spin_lock(&irq_desc[irq].lock);
+
+ if (!irq_has_action(irq) ||
+ cpus_equal(irq_desc[irq].affinity, map)) {
+ spin_unlock(&irq_desc[irq].lock);
+ continue;
+ }
+
+ cpus_and(mask, irq_desc[irq].affinity, map);
+ if (cpus_empty(mask)) {
+ break_affinity = 1;
+ mask = map;
+ }
+
+ if (irq_desc[irq].chip->mask)
+ irq_desc[irq].chip->mask(irq);
+
+ if (irq_desc[irq].chip->set_affinity)
+ irq_desc[irq].chip->set_affinity(irq, mask);
+ else if (!(warned++))
+ set_affinity = 0;
+
+ if (irq_desc[irq].chip->unmask)
+ irq_desc[irq].chip->unmask(irq);
+
+ spin_unlock(&irq_desc[irq].lock);
+
+ if (break_affinity && set_affinity)
+ printk("Broke affinity for irq %i\n", irq);
+ else if (!set_affinity)
+ printk("Cannot set affinity for irq %i\n", irq);
+ }
+
+ /* That doesn't seem sufficient. Give it 1ms. */
+ local_irq_enable();
+ mdelay(1);
+ local_irq_disable();
+}
+#endif
+
+extern void call_softirq(void);
+
+asmlinkage void do_softirq(void)
+{
+ __u32 pending;
+ unsigned long flags;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+ pending = local_softirq_pending();
+ /* Switch to interrupt stack */
+ if (pending) {
+ call_softirq();
+ WARN_ON_ONCE(softirq_count());
+ }
+ local_irq_restore(flags);
+}
--- /dev/null
+/*
+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2002 Andi Kleen
+ *
+ * This handles calls from both 32bit and 64bit mode.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/vmalloc.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/mmu_context.h>
+
+#ifdef CONFIG_SMP
+static void flush_ldt(void *null)
+{
+ if (current->active_mm)
+ load_LDT(¤t->active_mm->context);
+}
+#endif
+
+static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+{
+ void *oldldt, *newldt;
+ int oldsize;
+
+ if (mincount <= pc->size)
+ return 0;
+ oldsize = pc->size;
+ mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
+ (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
+ if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
+ newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
+ else
+ newldt = (void *)__get_free_page(GFP_KERNEL);
+
+ if (!newldt)
+ return -ENOMEM;
+
+ if (oldsize)
+ memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
+ oldldt = pc->ldt;
+ memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
+ (mincount - oldsize) * LDT_ENTRY_SIZE);
+
+#ifdef CONFIG_X86_64
+ /* CHECKME: Do we really need this ? */
+ wmb();
+#endif
+ pc->ldt = newldt;
+ wmb();
+ pc->size = mincount;
+ wmb();
+
+ if (reload) {
+#ifdef CONFIG_SMP
+ cpumask_t mask;
+
+ preempt_disable();
+#endif
+ make_pages_readonly(newldt,
+ (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
+ XENFEAT_writable_descriptor_tables);
+ load_LDT(pc);
+#ifdef CONFIG_SMP
+ mask = cpumask_of_cpu(smp_processor_id());
+ if (!cpus_equal(current->mm->cpu_vm_mask, mask))
+ smp_call_function(flush_ldt, NULL, 1, 1);
+ preempt_enable();
+#endif
+ }
+ if (oldsize) {
+ make_pages_writable(oldldt,
+ (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
+ XENFEAT_writable_descriptor_tables);
+ if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(oldldt);
+ else
+ put_page(virt_to_page(oldldt));
+ }
+ return 0;
+}
+
+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+{
+ int err = alloc_ldt(new, old->size, 0);
+
+ if (err < 0)
+ return err;
+ memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
+ make_pages_readonly(new->ldt,
+ (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+ XENFEAT_writable_descriptor_tables);
+ return 0;
+}
+
+/*
+ * we do not have to muck with descriptors here, that is
+ * done in switch_mm() as needed.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+ struct mm_struct *old_mm;
+ int retval = 0;
+
+ memset(&mm->context, 0, sizeof(mm->context));
+ mutex_init(&mm->context.lock);
+ old_mm = current->mm;
+ if (old_mm)
+ mm->context.vdso = old_mm->context.vdso;
+ if (old_mm && old_mm->context.size > 0) {
+ mutex_lock(&old_mm->context.lock);
+ retval = copy_ldt(&mm->context, &old_mm->context);
+ mutex_unlock(&old_mm->context.lock);
+ }
+ return retval;
+}
+
+/*
+ * No need to lock the MM as we are the last user
+ *
+ * 64bit: Don't touch the LDT register - we're already in the next thread.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+ if (mm->context.size) {
+ /* CHECKME: Can this ever happen ? */
+ if (mm == current->active_mm)
+ clear_LDT();
+ make_pages_writable(mm->context.ldt,
+ (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+ XENFEAT_writable_descriptor_tables);
+ if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(mm->context.ldt);
+ else
+ put_page(virt_to_page(mm->context.ldt));
+ mm->context.size = 0;
+ }
+}
+
+static int read_ldt(void __user *ptr, unsigned long bytecount)
+{
+ int err;
+ unsigned long size;
+ struct mm_struct *mm = current->mm;
+
+ if (!mm->context.size)
+ return 0;
+ if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
+ bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
+
+ mutex_lock(&mm->context.lock);
+ size = mm->context.size * LDT_ENTRY_SIZE;
+ if (size > bytecount)
+ size = bytecount;
+
+ err = 0;
+ if (copy_to_user(ptr, mm->context.ldt, size))
+ err = -EFAULT;
+ mutex_unlock(&mm->context.lock);
+ if (err < 0)
+ goto error_return;
+ if (size != bytecount) {
+ /* zero-fill the rest */
+ if (clear_user(ptr + size, bytecount - size) != 0) {
+ err = -EFAULT;
+ goto error_return;
+ }
+ }
+ return bytecount;
+error_return:
+ return err;
+}
+
+static int read_default_ldt(void __user *ptr, unsigned long bytecount)
+{
+ /* CHECKME: Can we use _one_ random number ? */
+#ifdef CONFIG_X86_32
+ unsigned long size = 5 * sizeof(struct desc_struct);
+#else
+ unsigned long size = 128;
+#endif
+ if (bytecount > size)
+ bytecount = size;
+ if (clear_user(ptr, bytecount))
+ return -EFAULT;
+ return bytecount;
+}
+
+static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
+{
+ struct mm_struct *mm = current->mm;
+ struct desc_struct ldt;
+ int error;
+ struct user_desc ldt_info;
+
+ error = -EINVAL;
+ if (bytecount != sizeof(ldt_info))
+ goto out;
+ error = -EFAULT;
+ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
+ goto out;
+
+ error = -EINVAL;
+ if (ldt_info.entry_number >= LDT_ENTRIES)
+ goto out;
+ if (ldt_info.contents == 3) {
+ if (oldmode)
+ goto out;
+ if (ldt_info.seg_not_present == 0)
+ goto out;
+ }
+
+ mutex_lock(&mm->context.lock);
+ if (ldt_info.entry_number >= mm->context.size) {
+ error = alloc_ldt(¤t->mm->context,
+ ldt_info.entry_number + 1, 1);
+ if (error < 0)
+ goto out_unlock;
+ }
+
+ /* Allow LDTs to be cleared by the user. */
+ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+ if (oldmode || LDT_empty(&ldt_info)) {
+ memset(&ldt, 0, sizeof(ldt));
+ goto install;
+ }
+ }
+
+ fill_ldt(&ldt, &ldt_info);
+ if (oldmode)
+ ldt.avl = 0;
+
+ /* Install the new entry ... */
+install:
+ error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
+
+out_unlock:
+ mutex_unlock(&mm->context.lock);
+out:
+ return error;
+}
+
+asmlinkage int sys_modify_ldt(int func, void __user *ptr,
+ unsigned long bytecount)
+{
+ int ret = -ENOSYS;
+
+ switch (func) {
+ case 0:
+ ret = read_ldt(ptr, bytecount);
+ break;
+ case 1:
+ ret = write_ldt(ptr, bytecount, 1);
+ break;
+ case 2:
+ ret = read_default_ldt(ptr, bytecount);
+ break;
+ case 0x11:
+ ret = write_ldt(ptr, bytecount, 0);
+ break;
+ }
+ return ret;
+}
#include <asm/desc.h>
#include <asm/system.h>
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
static u32 kexec_pgd[1024] PAGE_ALIGNED;
#ifdef CONFIG_X86_PAE
static u32 kexec_pte0[1024] PAGE_ALIGNED;
static u32 kexec_pte1[1024] PAGE_ALIGNED;
-static void set_idt(void *newidt, __u16 limit)
-{
- struct desc_ptr curidt;
+#ifdef CONFIG_XEN
- /* ia32 supports unaliged loads & stores */
- curidt.size = limit;
- curidt.address = (unsigned long)newidt;
+#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
- load_idt(&curidt);
-};
+#if PAGES_NR > KEXEC_XEN_NO_PAGES
+#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
+#endif
+#if PA_CONTROL_PAGE != 0
+#error PA_CONTROL_PAGE is non zero - Xen support will break
+#endif
-static void set_gdt(void *newgdt, __u16 limit)
+void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
{
- struct desc_ptr curgdt;
+ void *control_page;
- /* ia32 supports unaligned loads & stores */
- curgdt.size = limit;
- curgdt.address = (unsigned long)newgdt;
+ memset(xki->page_list, 0, sizeof(xki->page_list));
- load_gdt(&curgdt);
-};
+ control_page = page_address(image->control_code_page);
+ memcpy(control_page, relocate_kernel, PAGE_SIZE);
-static void load_segments(void)
+ xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
+ xki->page_list[PA_PGD] = __ma(kexec_pgd);
+#ifdef CONFIG_X86_PAE
+ xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
+ xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
+#endif
+ xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
+ xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
+
+}
+
+int __init machine_kexec_setup_resources(struct resource *hypervisor,
+ struct resource *phys_cpus,
+ int nr_phys_cpus)
{
-#define __STR(X) #X
-#define STR(X) __STR(X)
-
- __asm__ __volatile__ (
- "\tljmp $"STR(__KERNEL_CS)",$1f\n"
- "\t1:\n"
- "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
- "\tmovl %%eax,%%ds\n"
- "\tmovl %%eax,%%es\n"
- "\tmovl %%eax,%%fs\n"
- "\tmovl %%eax,%%gs\n"
- "\tmovl %%eax,%%ss\n"
- ::: "eax", "memory");
-#undef STR
-#undef __STR
+ int k;
+
+ /* The per-cpu crash note resources belong to the hypervisor resource */
+ for (k = 0; k < nr_phys_cpus; k++)
+ request_resource(hypervisor, phys_cpus + k);
+
+ return 0;
}
+void machine_kexec_register_resources(struct resource *res) { ; }
+
+#endif /* CONFIG_XEN */
+
/*
* A architecture hook called to validate the
* proposed image and prepare the control pages
{
}
+#ifndef CONFIG_XEN
/*
* Do not allocate memory (or fail in any way) in machine_kexec().
* We are past the point of no return, committed to rebooting now.
page_list[PA_PTE_1] = __pa(kexec_pte1);
page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
- /* The segment registers are funny things, they have both a
- * visible and an invisible part. Whenever the visible part is
- * set to a specific selector, the invisible part is loaded
- * with from a table in memory. At no other time is the
- * descriptor table in memory accessed.
- *
- * I take advantage of this here by force loading the
- * segments, before I zap the gdt with an invalid value.
- */
- load_segments();
- /* The gdt & idt are now invalid.
- * If you want to load them you must set up your own idt & gdt.
- */
- set_gdt(phys_to_virt(0),0);
- set_idt(phys_to_virt(0),0);
-
- /* now call it */
relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
image->start, cpu_has_pae);
}
+#endif
void arch_crash_save_vmcoreinfo(void)
{
static u64 kexec_pmd1[512] PAGE_ALIGNED;
static u64 kexec_pte1[512] PAGE_ALIGNED;
+#ifdef CONFIG_XEN
+
+/* In the case of Xen, override hypervisor functions to be able to create
+ * a regular identity mapping page table...
+ */
+
+#include <xen/interface/kexec.h>
+#include <xen/interface/memory.h>
+
+#define x__pmd(x) ((pmd_t) { (x) } )
+#define x__pud(x) ((pud_t) { (x) } )
+#define x__pgd(x) ((pgd_t) { (x) } )
+
+#define x_pmd_val(x) ((x).pmd)
+#define x_pud_val(x) ((x).pud)
+#define x_pgd_val(x) ((x).pgd)
+
+static inline void x_set_pmd(pmd_t *dst, pmd_t val)
+{
+ x_pmd_val(*dst) = x_pmd_val(val);
+}
+
+static inline void x_set_pud(pud_t *dst, pud_t val)
+{
+ x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
+}
+
+static inline void x_pud_clear (pud_t *pud)
+{
+ x_pud_val(*pud) = 0;
+}
+
+static inline void x_set_pgd(pgd_t *dst, pgd_t val)
+{
+ x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val));
+}
+
+static inline void x_pgd_clear (pgd_t * pgd)
+{
+ x_pgd_val(*pgd) = 0;
+}
+
+#define X__PAGE_KERNEL_LARGE_EXEC \
+ _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE
+#define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY
+
+#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
+
+#if PAGES_NR > KEXEC_XEN_NO_PAGES
+#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
+#endif
+
+#if PA_CONTROL_PAGE != 0
+#error PA_CONTROL_PAGE is non zero - Xen support will break
+#endif
+
+void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+ void *control_page;
+ void *table_page;
+
+ memset(xki->page_list, 0, sizeof(xki->page_list));
+
+ control_page = page_address(image->control_code_page) + PAGE_SIZE;
+ memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+ table_page = page_address(image->control_code_page);
+
+ xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
+ xki->page_list[PA_TABLE_PAGE] = __ma(table_page);
+
+ xki->page_list[PA_PGD] = __ma(kexec_pgd);
+ xki->page_list[PA_PUD_0] = __ma(kexec_pud0);
+ xki->page_list[PA_PUD_1] = __ma(kexec_pud1);
+ xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
+ xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
+ xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
+ xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
+}
+
+int __init machine_kexec_setup_resources(struct resource *hypervisor,
+ struct resource *phys_cpus,
+ int nr_phys_cpus)
+{
+ int k;
+
+ /* The per-cpu crash note resources belong to the hypervisor resource */
+ for (k = 0; k < nr_phys_cpus; k++)
+ request_resource(hypervisor, phys_cpus + k);
+
+ return 0;
+}
+
+#else /* CONFIG_XEN */
+
+#define x__pmd(x) __pmd(x)
+#define x__pud(x) __pud(x)
+#define x__pgd(x) __pgd(x)
+
+#define x_set_pmd(x, y) set_pmd(x, y)
+#define x_set_pud(x, y) set_pud(x, y)
+#define x_set_pgd(x, y) set_pgd(x, y)
+
+#define x_pud_clear(x) pud_clear(x)
+#define x_pgd_clear(x) pgd_clear(x)
+
+#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
+#define X_KERNPG_TABLE _KERNPG_TABLE
+
+#endif /* CONFIG_XEN */
+
static void init_level2_page(pmd_t *level2p, unsigned long addr)
{
unsigned long end_addr;
addr &= PAGE_MASK;
end_addr = addr + PUD_SIZE;
while (addr < end_addr) {
- set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+ x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
addr += PMD_SIZE;
}
}
}
level2p = (pmd_t *)page_address(page);
init_level2_page(level2p, addr);
- set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
+ x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE));
addr += PUD_SIZE;
}
/* clear the unused entries */
while (addr < end_addr) {
- pud_clear(level3p++);
+ x_pud_clear(level3p++);
addr += PUD_SIZE;
}
out:
if (result) {
goto out;
}
- set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
+ x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE));
addr += PGDIR_SIZE;
}
/* clear the unused entries */
while (addr < end_addr) {
- pgd_clear(level4p++);
+ x_pgd_clear(level4p++);
addr += PGDIR_SIZE;
}
out:
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
pgd_t *level4p;
- level4p = (pgd_t *)__va(start_pgtable);
- return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
-}
-
-static void set_idt(void *newidt, u16 limit)
-{
- struct desc_ptr curidt;
-
- /* x86-64 supports unaliged loads & stores */
- curidt.size = limit;
- curidt.address = (unsigned long)newidt;
-
- __asm__ __volatile__ (
- "lidtq %0\n"
- : : "m" (curidt)
- );
-};
+ unsigned long x_end_pfn = end_pfn;
+#ifdef CONFIG_XEN
+ x_end_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
+#endif
-static void set_gdt(void *newgdt, u16 limit)
-{
- struct desc_ptr curgdt;
-
- /* x86-64 supports unaligned loads & stores */
- curgdt.size = limit;
- curgdt.address = (unsigned long)newgdt;
-
- __asm__ __volatile__ (
- "lgdtq %0\n"
- : : "m" (curgdt)
- );
-};
-
-static void load_segments(void)
-{
- __asm__ __volatile__ (
- "\tmovl %0,%%ds\n"
- "\tmovl %0,%%es\n"
- "\tmovl %0,%%ss\n"
- "\tmovl %0,%%fs\n"
- "\tmovl %0,%%gs\n"
- : : "a" (__KERNEL_DS) : "memory"
- );
+ level4p = (pgd_t *)__va(start_pgtable);
+ return init_level4_page(image, level4p, 0, x_end_pfn << PAGE_SHIFT);
}
int machine_kexec_prepare(struct kimage *image)
return;
}
+#ifndef CONFIG_XEN
/*
* Do not allocate memory (or fail in any way) in machine_kexec().
* We are past the point of no return, committed to rebooting now.
page_list[PA_TABLE_PAGE] =
(unsigned long)__pa(page_address(image->control_code_page));
- /* The segment registers are funny things, they have both a
- * visible and an invisible part. Whenever the visible part is
- * set to a specific selector, the invisible part is loaded
- * with from a table in memory. At no other time is the
- * descriptor table in memory accessed.
- *
- * I take advantage of this here by force loading the
- * segments, before I zap the gdt with an invalid value.
- */
- load_segments();
- /* The gdt & idt are now invalid.
- * If you want to load them you must set up your own idt & gdt.
- */
- set_gdt(phys_to_virt(0),0);
- set_idt(phys_to_virt(0),0);
-
- /* now call it */
relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
image->start);
}
+#endif
void arch_crash_save_vmcoreinfo(void)
{
--- /dev/null
+/*
+ * Intel CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ *
+ * This driver allows to upgrade microcode on Intel processors
+ * belonging to IA-32 family - PentiumPro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
+ *
+ * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
+ * Order Number 245472 or free download from:
+ *
+ * http://developer.intel.com/design/pentium4/manuals/245472.htm
+ *
+ * For more information, go to http://www.urbanmyth.org/microcode
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+//#define DEBUG /* pr_debug */
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+
+MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_LICENSE("GPL");
+
+static int verbose;
+module_param(verbose, int, 0644);
+
+#define MICROCODE_VERSION "1.14a-xen"
+
+#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
+#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
+
+/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+static DEFINE_MUTEX(microcode_mutex);
+
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+static int do_microcode_update (const void __user *ubuf, size_t len)
+{
+ int err;
+ void *kbuf;
+
+ kbuf = vmalloc(len);
+ if (!kbuf)
+ return -ENOMEM;
+
+ if (copy_from_user(kbuf, ubuf, len) == 0) {
+ struct xen_platform_op op;
+
+ op.cmd = XENPF_microcode_update;
+ set_xen_guest_handle(op.u.microcode.data, kbuf);
+ op.u.microcode.length = len;
+ err = HYPERVISOR_platform_op(&op);
+ } else
+ err = -EFAULT;
+
+ vfree(kbuf);
+
+ return err;
+}
+
+static int microcode_open (struct inode *unused1, struct file *unused2)
+{
+ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+}
+
+static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
+{
+ ssize_t ret;
+
+ if (len < MC_HEADER_SIZE) {
+ printk(KERN_ERR "microcode: not enough data\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(µcode_mutex);
+
+ ret = do_microcode_update(buf, len);
+ if (!ret)
+ ret = (ssize_t)len;
+
+ mutex_unlock(µcode_mutex);
+
+ return ret;
+}
+
+static const struct file_operations microcode_fops = {
+ .owner = THIS_MODULE,
+ .write = microcode_write,
+ .open = microcode_open,
+};
+
+static struct miscdevice microcode_dev = {
+ .minor = MICROCODE_MINOR,
+ .name = "microcode",
+ .fops = µcode_fops,
+};
+
+static int __init microcode_dev_init (void)
+{
+ int error;
+
+ error = misc_register(µcode_dev);
+ if (error) {
+ printk(KERN_ERR
+ "microcode: can't misc_register on minor=%d\n",
+ MICROCODE_MINOR);
+ return error;
+ }
+
+ return 0;
+}
+
+static void microcode_dev_exit (void)
+{
+ misc_deregister(µcode_dev);
+}
+
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+#else
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while(0)
+#endif
+
+/* fake device for request_firmware */
+static struct platform_device *microcode_pdev;
+
+static int request_microcode(void)
+{
+ char name[30];
+ const struct cpuinfo_x86 *c = &boot_cpu_data;
+ const struct firmware *firmware;
+ int error;
+ struct xen_platform_op op;
+
+ sprintf(name,"intel-ucode/%02x-%02x-%02x",
+ c->x86, c->x86_model, c->x86_mask);
+ error = request_firmware(&firmware, name, µcode_pdev->dev);
+ if (error) {
+ pr_debug("microcode: ucode data file %s load failed\n", name);
+ return error;
+ }
+
+ op.cmd = XENPF_microcode_update;
+ set_xen_guest_handle(op.u.microcode.data, firmware->data);
+ op.u.microcode.length = firmware->size;
+ error = HYPERVISOR_platform_op(&op);
+
+ release_firmware(firmware);
+
+ if (error)
+ pr_debug("ucode load failed\n");
+
+ return error;
+}
+
+static int __init microcode_init (void)
+{
+ int error;
+
+ error = microcode_dev_init();
+ if (error)
+ return error;
+ microcode_pdev = platform_device_register_simple("microcode", -1,
+ NULL, 0);
+ if (IS_ERR(microcode_pdev)) {
+ microcode_dev_exit();
+ return PTR_ERR(microcode_pdev);
+ }
+
+ request_microcode();
+
+ printk(KERN_INFO
+ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
+ return 0;
+}
+
+static void __exit microcode_exit (void)
+{
+ microcode_dev_exit();
+ platform_device_unregister(microcode_pdev);
+}
+
+module_init(microcode_init)
+module_exit(microcode_exit)
--- /dev/null
+/*
+ * Intel Multiprocessor Specification 1.1 and 1.4
+ * compliant MP-table parsing routines.
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/bitops.h>
+#include <linux/acpi.h>
+#include <linux/module.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/pgalloc.h>
+#include <asm/io_apic.h>
+#include <asm/proto.h>
+#include <asm/acpi.h>
+#include <asm/bios_ebda.h>
+
+#include <mach_apic.h>
+#ifdef CONFIG_X86_32
+#include <mach_mpparse.h>
+#endif
+
+/* Have we found an MP table */
+int smp_found_config;
+
+/*
+ * Various Linux-internal data structures created from the
+ * MP-table.
+ */
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
+
+static int mp_current_pci_id;
+
+int pic_mode;
+
+/*
+ * Intel MP BIOS table parsing routines:
+ */
+
+/*
+ * Checksum an MP configuration block.
+ */
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+ int sum = 0;
+
+ while (len--)
+ sum += *mp++;
+
+ return sum & 0xFF;
+}
+
+#ifdef CONFIG_X86_NUMAQ
+/*
+ * Have to match translation table entries to main table entries by counter
+ * hence the mpc_record variable .... can't see a less disgusting way of
+ * doing this ....
+ */
+
+static int mpc_record;
+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
+ __cpuinitdata;
+#endif
+
+#ifndef CONFIG_XEN
+static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
+{
+ int apicid;
+ char *bootup_cpu = "";
+
+ if (!(m->mpc_cpuflag & CPU_ENABLED)) {
+ disabled_cpus++;
+ return;
+ }
+#ifdef CONFIG_X86_NUMAQ
+ apicid = mpc_apic_id(m, translation_table[mpc_record]);
+#else
+ apicid = m->mpc_apicid;
+#endif
+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+ bootup_cpu = " (Bootup-CPU)";
+ boot_cpu_physical_apicid = m->mpc_apicid;
+ }
+
+ printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
+ generic_processor_info(apicid, m->mpc_apicver);
+}
+#else
+static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
+{
+ num_processors++;
+}
+#endif /* CONFIG_XEN */
+
+static void __init MP_bus_info(struct mpc_config_bus *m)
+{
+ char str[7];
+
+ memcpy(str, m->mpc_bustype, 6);
+ str[6] = 0;
+
+#ifdef CONFIG_X86_NUMAQ
+ mpc_oem_bus_info(m, str, translation_table[mpc_record]);
+#else
+ Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
+#endif
+
+#if MAX_MP_BUSSES < 256
+ if (m->mpc_busid >= MAX_MP_BUSSES) {
+ printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
+ " is too large, max. supported is %d\n",
+ m->mpc_busid, str, MAX_MP_BUSSES - 1);
+ return;
+ }
+#endif
+
+ if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
+ set_bit(m->mpc_busid, mp_bus_not_pci);
+#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
+#endif
+ } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
+#ifdef CONFIG_X86_NUMAQ
+ mpc_oem_pci_bus(m, translation_table[mpc_record]);
+#endif
+ clear_bit(m->mpc_busid, mp_bus_not_pci);
+ mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
+ mp_current_pci_id++;
+#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+ } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
+ } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
+#endif
+ } else
+ printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
+}
+
+#ifdef CONFIG_X86_IO_APIC
+
+static int bad_ioapic(unsigned long address)
+{
+ if (nr_ioapics >= MAX_IO_APICS) {
+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+ }
+ if (!address) {
+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+ " found in table, skipping!\n");
+ return 1;
+ }
+ return 0;
+}
+
+static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
+{
+ if (!(m->mpc_flags & MPC_APIC_USABLE))
+ return;
+
+ printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
+ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
+
+ if (bad_ioapic(m->mpc_apicaddr))
+ return;
+
+ mp_ioapics[nr_ioapics] = *m;
+ nr_ioapics++;
+}
+
+static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+{
+ mp_irqs[mp_irq_entries] = *m;
+ Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ m->mpc_irqtype, m->mpc_irqflag & 3,
+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
+ m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
+}
+
+#endif
+
+static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
+{
+ Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
+ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+ m->mpc_irqtype, m->mpc_irqflag & 3,
+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
+}
+
+#ifdef CONFIG_X86_NUMAQ
+static void __init MP_translation_info(struct mpc_config_translation *m)
+{
+ printk(KERN_INFO
+ "Translation: record %d, type %d, quad %d, global %d, local %d\n",
+ mpc_record, m->trans_type, m->trans_quad, m->trans_global,
+ m->trans_local);
+
+ if (mpc_record >= MAX_MPC_ENTRY)
+ printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
+ else
+ translation_table[mpc_record] = m; /* stash this for later */
+ if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
+ node_set_online(m->trans_quad);
+}
+
+/*
+ * Read/parse the MPC oem tables
+ */
+
+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
+ unsigned short oemsize)
+{
+ int count = sizeof(*oemtable); /* the header size */
+ unsigned char *oemptr = ((unsigned char *)oemtable) + count;
+
+ mpc_record = 0;
+ printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
+ oemtable);
+ if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
+ printk(KERN_WARNING
+ "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
+ oemtable->oem_signature[0], oemtable->oem_signature[1],
+ oemtable->oem_signature[2], oemtable->oem_signature[3]);
+ return;
+ }
+ if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
+ printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
+ return;
+ }
+ while (count < oemtable->oem_length) {
+ switch (*oemptr) {
+ case MP_TRANSLATION:
+ {
+ struct mpc_config_translation *m =
+ (struct mpc_config_translation *)oemptr;
+ MP_translation_info(m);
+ oemptr += sizeof(*m);
+ count += sizeof(*m);
+ ++mpc_record;
+ break;
+ }
+ default:
+ {
+ printk(KERN_WARNING
+ "Unrecognised OEM table entry type! - %d\n",
+ (int)*oemptr);
+ return;
+ }
+ }
+ }
+}
+
+static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid)
+{
+ if (strncmp(oem, "IBM NUMA", 8))
+ printk("Warning! May not be a NUMA-Q system!\n");
+ if (mpc->mpc_oemptr)
+ smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
+ mpc->mpc_oemsize);
+}
+#endif /* CONFIG_X86_NUMAQ */
+
+/*
+ * Read/parse the MPC
+ */
+
+static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+{
+ char str[16];
+ char oem[10];
+ int count = sizeof(*mpc);
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+ if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
+ printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
+ mpc->mpc_signature[0], mpc->mpc_signature[1],
+ mpc->mpc_signature[2], mpc->mpc_signature[3]);
+ return 0;
+ }
+ if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) {
+ printk(KERN_ERR "MPTABLE: checksum error!\n");
+ return 0;
+ }
+ if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) {
+ printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
+ mpc->mpc_spec);
+ return 0;
+ }
+ if (!mpc->mpc_lapic) {
+ printk(KERN_ERR "MPTABLE: null local APIC address!\n");
+ return 0;
+ }
+ memcpy(oem, mpc->mpc_oem, 8);
+ oem[8] = 0;
+ printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
+
+ memcpy(str, mpc->mpc_productid, 12);
+ str[12] = 0;
+ printk("Product ID: %s ", str);
+
+#ifdef CONFIG_X86_32
+ mps_oem_check(mpc, oem, str);
+#endif
+ printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
+
+#ifndef CONFIG_XEN
+ printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
+
+ /* save the local APIC address, it might be non-default */
+ if (!acpi_lapic)
+ mp_lapic_addr = mpc->mpc_lapic;
+#endif
+
+ if (early)
+ return 1;
+
+ /*
+ * Now process the configuration blocks.
+ */
+#ifdef CONFIG_X86_NUMAQ
+ mpc_record = 0;
+#endif
+ while (count < mpc->mpc_length) {
+ switch (*mpt) {
+ case MP_PROCESSOR:
+ {
+ struct mpc_config_processor *m =
+ (struct mpc_config_processor *)mpt;
+ /* ACPI may have already provided this data */
+ if (!acpi_lapic)
+ MP_processor_info(m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_BUS:
+ {
+ struct mpc_config_bus *m =
+ (struct mpc_config_bus *)mpt;
+ MP_bus_info(m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_IOAPIC:
+ {
+#ifdef CONFIG_X86_IO_APIC
+ struct mpc_config_ioapic *m =
+ (struct mpc_config_ioapic *)mpt;
+ MP_ioapic_info(m);
+#endif
+ mpt += sizeof(struct mpc_config_ioapic);
+ count += sizeof(struct mpc_config_ioapic);
+ break;
+ }
+ case MP_INTSRC:
+ {
+#ifdef CONFIG_X86_IO_APIC
+ struct mpc_config_intsrc *m =
+ (struct mpc_config_intsrc *)mpt;
+
+ MP_intsrc_info(m);
+#endif
+ mpt += sizeof(struct mpc_config_intsrc);
+ count += sizeof(struct mpc_config_intsrc);
+ break;
+ }
+ case MP_LINTSRC:
+ {
+ struct mpc_config_lintsrc *m =
+ (struct mpc_config_lintsrc *)mpt;
+ MP_lintsrc_info(m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ default:
+ /* wrong mptable */
+ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
+ printk(KERN_ERR "type %x\n", *mpt);
+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
+ 1, mpc, mpc->mpc_length, 1);
+ count = mpc->mpc_length;
+ break;
+ }
+#ifdef CONFIG_X86_NUMAQ
+ ++mpc_record;
+#endif
+ }
+ setup_apic_routing();
+ if (!num_processors)
+ printk(KERN_ERR "MPTABLE: no processors registered!\n");
+ return num_processors;
+}
+
+#ifdef CONFIG_X86_IO_APIC
+
+static int __init ELCR_trigger(unsigned int irq)
+{
+ unsigned int port;
+
+ port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+}
+
+static void __init construct_default_ioirq_mptable(int mpc_default_type)
+{
+ struct mpc_config_intsrc intsrc;
+ int i;
+ int ELCR_fallback = 0;
+
+ intsrc.mpc_type = MP_INTSRC;
+ intsrc.mpc_irqflag = 0; /* conforming */
+ intsrc.mpc_srcbus = 0;
+ intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+
+ intsrc.mpc_irqtype = mp_INT;
+
+ /*
+ * If true, we have an ISA/PCI system with no IRQ entries
+ * in the MP table. To prevent the PCI interrupts from being set up
+ * incorrectly, we try to use the ELCR. The sanity check to see if
+ * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
+ * never be level sensitive, so we simply see if the ELCR agrees.
+ * If it does, we assume it's valid.
+ */
+ if (mpc_default_type == 5) {
+ printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
+ "falling back to ELCR\n");
+
+ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
+ ELCR_trigger(13))
+ printk(KERN_ERR "ELCR contains invalid data... "
+ "not using ELCR\n");
+ else {
+ printk(KERN_INFO
+ "Using ELCR to identify PCI interrupts\n");
+ ELCR_fallback = 1;
+ }
+ }
+
+ for (i = 0; i < 16; i++) {
+ switch (mpc_default_type) {
+ case 2:
+ if (i == 0 || i == 13)
+ continue; /* IRQ0 & IRQ13 not connected */
+ /* fall through */
+ default:
+ if (i == 2)
+ continue; /* IRQ2 is never connected */
+ }
+
+ if (ELCR_fallback) {
+ /*
+ * If the ELCR indicates a level-sensitive interrupt, we
+ * copy that information over to the MP table in the
+ * irqflag field (level sensitive, active high polarity).
+ */
+ if (ELCR_trigger(i))
+ intsrc.mpc_irqflag = 13;
+ else
+ intsrc.mpc_irqflag = 0;
+ }
+
+ intsrc.mpc_srcbusirq = i;
+ intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
+ MP_intsrc_info(&intsrc);
+ }
+
+ intsrc.mpc_irqtype = mp_ExtINT;
+ intsrc.mpc_srcbusirq = 0;
+ intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
+ MP_intsrc_info(&intsrc);
+}
+
+#endif
+
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+ struct mpc_config_processor processor;
+ struct mpc_config_bus bus;
+#ifdef CONFIG_X86_IO_APIC
+ struct mpc_config_ioapic ioapic;
+#endif
+ struct mpc_config_lintsrc lintsrc;
+ int linttypes[2] = { mp_ExtINT, mp_NMI };
+ int i;
+
+#ifndef CONFIG_XEN
+ /*
+ * local APIC has default address
+ */
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+#endif
+
+ /*
+ * 2 CPUs, numbered 0 & 1.
+ */
+ processor.mpc_type = MP_PROCESSOR;
+ /* Either an integrated APIC or a discrete 82489DX. */
+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ processor.mpc_cpuflag = CPU_ENABLED;
+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+ processor.mpc_reserved[0] = 0;
+ processor.mpc_reserved[1] = 0;
+ for (i = 0; i < 2; i++) {
+ processor.mpc_apicid = i;
+ MP_processor_info(&processor);
+ }
+
+ bus.mpc_type = MP_BUS;
+ bus.mpc_busid = 0;
+ switch (mpc_default_type) {
+ default:
+ printk(KERN_ERR "???\nUnknown standard configuration %d\n",
+ mpc_default_type);
+ /* fall through */
+ case 1:
+ case 5:
+ memcpy(bus.mpc_bustype, "ISA ", 6);
+ break;
+ case 2:
+ case 6:
+ case 3:
+ memcpy(bus.mpc_bustype, "EISA ", 6);
+ break;
+ case 4:
+ case 7:
+ memcpy(bus.mpc_bustype, "MCA ", 6);
+ }
+ MP_bus_info(&bus);
+ if (mpc_default_type > 4) {
+ bus.mpc_busid = 1;
+ memcpy(bus.mpc_bustype, "PCI ", 6);
+ MP_bus_info(&bus);
+ }
+
+#ifdef CONFIG_X86_IO_APIC
+ ioapic.mpc_type = MP_IOAPIC;
+ ioapic.mpc_apicid = 2;
+ ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ ioapic.mpc_flags = MPC_APIC_USABLE;
+ ioapic.mpc_apicaddr = 0xFEC00000;
+ MP_ioapic_info(&ioapic);
+
+ /*
+ * We set up most of the low 16 IO-APIC pins according to MPS rules.
+ */
+ construct_default_ioirq_mptable(mpc_default_type);
+#endif
+ lintsrc.mpc_type = MP_LINTSRC;
+ lintsrc.mpc_irqflag = 0; /* conforming */
+ lintsrc.mpc_srcbusid = 0;
+ lintsrc.mpc_srcbusirq = 0;
+ lintsrc.mpc_destapic = MP_APIC_ALL;
+ for (i = 0; i < 2; i++) {
+ lintsrc.mpc_irqtype = linttypes[i];
+ lintsrc.mpc_destapiclint = i;
+ MP_lintsrc_info(&lintsrc);
+ }
+}
+
+static struct intel_mp_floating *mpf_found;
+
+/*
+ * Scan the memory blocks for an SMP configuration block.
+ */
+static void __init __get_smp_config(unsigned early)
+{
+ struct intel_mp_floating *mpf = mpf_found;
+
+ if (acpi_lapic && early)
+ return;
+ /*
+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
+ * processors, where MPS only supports physical.
+ */
+ if (acpi_lapic && acpi_ioapic) {
+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
+ "information\n");
+ return;
+ } else if (acpi_lapic)
+ printk(KERN_INFO "Using ACPI for processor (LAPIC) "
+ "configuration information\n");
+
+ printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
+ mpf->mpf_specification);
+#ifdef CONFIG_X86_32
+ if (mpf->mpf_feature2 & (1 << 7)) {
+ printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
+ pic_mode = 1;
+ } else {
+ printk(KERN_INFO " Virtual Wire compatibility mode.\n");
+ pic_mode = 0;
+ }
+#endif
+ /*
+ * Now see if we need to read further.
+ */
+ if (mpf->mpf_feature1 != 0) {
+ if (early) {
+#ifndef CONFIG_XEN
+ /*
+ * local APIC has default address
+ */
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+#endif
+ return;
+ }
+
+ printk(KERN_INFO "Default MP configuration #%d\n",
+ mpf->mpf_feature1);
+ construct_default_ISA_mptable(mpf->mpf_feature1);
+
+ } else if (mpf->mpf_physptr) {
+
+ /*
+ * Read the physical hardware table. Anything here will
+ * override the defaults.
+ */
+ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
+ smp_found_config = 0;
+ printk(KERN_ERR
+ "BIOS bug, MP table errors detected!...\n");
+ printk(KERN_ERR "... disabling SMP support. "
+ "(tell your hw vendor)\n");
+ return;
+ }
+
+ if (early)
+ return;
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * If there are no explicit MP IRQ entries, then we are
+ * broken. We set up most of the low 16 IO-APIC pins to
+ * ISA defaults and hope it will work.
+ */
+ if (!mp_irq_entries) {
+ struct mpc_config_bus bus;
+
+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
+ "using default mptable. "
+ "(tell your hw vendor)\n");
+
+ bus.mpc_type = MP_BUS;
+ bus.mpc_busid = 0;
+ memcpy(bus.mpc_bustype, "ISA ", 6);
+ MP_bus_info(&bus);
+
+ construct_default_ioirq_mptable(0);
+ }
+#endif
+ } else
+ BUG();
+
+ if (!early)
+ printk(KERN_INFO "Processors: %d\n", num_processors);
+ /*
+ * Only use the first configuration found.
+ */
+}
+
+void __init early_get_smp_config(void)
+{
+ __get_smp_config(1);
+}
+
+void __init get_smp_config(void)
+{
+ __get_smp_config(0);
+}
+
+static int __init smp_scan_config(unsigned long base, unsigned long length,
+ unsigned reserve)
+{
+ unsigned int *bp = isa_bus_to_virt(base);
+ struct intel_mp_floating *mpf;
+
+ Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
+ BUILD_BUG_ON(sizeof(*mpf) != 16);
+
+ while (length > 0) {
+ mpf = (struct intel_mp_floating *)bp;
+ if ((*bp == SMP_MAGIC_IDENT) &&
+ (mpf->mpf_length == 1) &&
+ !mpf_checksum((unsigned char *)bp, 16) &&
+ ((mpf->mpf_specification == 1)
+ || (mpf->mpf_specification == 4))) {
+
+ smp_found_config = 1;
+ mpf_found = mpf;
+#ifdef CONFIG_X86_32
+#ifndef CONFIG_XEN
+ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
+ mpf, virt_to_phys(mpf));
+ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
+ BOOTMEM_DEFAULT);
+ if (mpf->mpf_physptr) {
+ /*
+ * We cannot access to MPC table to compute
+ * table size yet, as only few megabytes from
+ * the bottom is mapped now.
+ * PC-9800's MPC table places on the very last
+ * of physical memory; so that simply reserving
+ * PAGE_SIZE from mpg->mpf_physptr yields BUG()
+ * in reserve_bootmem.
+ */
+ unsigned long size = PAGE_SIZE;
+ unsigned long end = max_low_pfn * PAGE_SIZE;
+ if (mpf->mpf_physptr + size > end)
+ size = end - mpf->mpf_physptr;
+ reserve_bootmem(mpf->mpf_physptr, size,
+ BOOTMEM_DEFAULT);
+ }
+#else
+ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
+ mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
+#endif
+#elif !defined(CONFIG_XEN)
+ if (!reserve)
+ return 1;
+
+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
+ if (mpf->mpf_physptr)
+ reserve_bootmem_generic(mpf->mpf_physptr,
+ PAGE_SIZE);
+#endif
+ return 1;
+ }
+ bp += 4;
+ length -= 16;
+ }
+ return 0;
+}
+
+static void __init __find_smp_config(unsigned reserve)
+{
+#ifndef CONFIG_XEN
+ unsigned int address;
+#endif
+
+ /*
+ * FIXME: Linux assumes you have 640K of base ram..
+ * this continues the error...
+ *
+ * 1) Scan the bottom 1K for a signature
+ * 2) Scan the top 1K of base RAM
+ * 3) Scan the 64K of bios
+ */
+ if (smp_scan_config(0x0, 0x400, reserve) ||
+ smp_scan_config(639 * 0x400, 0x400, reserve) ||
+ smp_scan_config(0xF0000, 0x10000, reserve))
+ return;
+ /*
+ * If it is an SMP machine we should know now, unless the
+ * configuration is in an EISA/MCA bus machine with an
+ * extended bios data area.
+ *
+ * there is a real-mode segmented pointer pointing to the
+ * 4K EBDA area at 0x40E, calculate and scan it here.
+ *
+ * NOTE! There are Linux loaders that will corrupt the EBDA
+ * area, and as such this kind of SMP config may be less
+ * trustworthy, simply because the SMP table may have been
+ * stomped on during early boot. These loaders are buggy and
+ * should be fixed.
+ *
+ * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
+ */
+
+#ifndef CONFIG_XEN
+ address = get_bios_ebda();
+ if (address)
+ smp_scan_config(address, 0x400, reserve);
+#endif
+}
+
+void __init early_find_smp_config(void)
+{
+ __find_smp_config(0);
+}
+
+void __init find_smp_config(void)
+{
+ __find_smp_config(1);
+}
+
+/* --------------------------------------------------------------------------
+ ACPI-based MP Configuration
+ -------------------------------------------------------------------------- */
+
+/*
+ * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
+ */
+int es7000_plat;
+
+#ifdef CONFIG_ACPI
+
+#ifdef CONFIG_X86_IO_APIC
+
+#define MP_ISA_BUS 0
+
+extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
+
+static int mp_find_ioapic(int gsi)
+{
+ int i = 0;
+
+ /* Find the IOAPIC that manages this GSI. */
+ for (i = 0; i < nr_ioapics; i++) {
+ if ((gsi >= mp_ioapic_routing[i].gsi_base)
+ && (gsi <= mp_ioapic_routing[i].gsi_end))
+ return i;
+ }
+
+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ return -1;
+}
+
+static u8 __init uniq_ioapic_id(u8 id)
+{
+#ifdef CONFIG_X86_32
+#ifndef CONFIG_XEN
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return io_apic_get_unique_id(nr_ioapics, id);
+ else
+#endif
+ return id;
+#else
+ int i;
+ DECLARE_BITMAP(used, 256);
+ bitmap_zero(used, 256);
+ for (i = 0; i < nr_ioapics; i++) {
+ struct mpc_config_ioapic *ia = &mp_ioapics[i];
+ __set_bit(ia->mpc_apicid, used);
+ }
+ if (!test_bit(id, used))
+ return id;
+ return find_first_zero_bit(used, 256);
+#endif
+}
+
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+{
+ int idx = 0;
+
+ if (bad_ioapic(address))
+ return;
+
+ idx = nr_ioapics;
+
+ mp_ioapics[idx].mpc_type = MP_IOAPIC;
+ mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
+ mp_ioapics[idx].mpc_apicaddr = address;
+
+#ifndef CONFIG_XEN
+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+#endif
+ mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
+#ifdef CONFIG_X86_32
+ mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
+#else
+ mp_ioapics[idx].mpc_apicver = 0;
+#endif
+ /*
+ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+ */
+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
+ mp_ioapic_routing[idx].gsi_base = gsi_base;
+ mp_ioapic_routing[idx].gsi_end = gsi_base +
+ io_apic_get_redir_entries(idx);
+
+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
+ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+ mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+
+ nr_ioapics++;
+}
+
+void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+{
+ struct mpc_config_intsrc intsrc;
+ int ioapic = -1;
+ int pin = -1;
+
+ /*
+ * Convert 'gsi' to 'ioapic.pin'.
+ */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return;
+ pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+ /*
+ * TBD: This check is for faulty timer entries, where the override
+ * erroneously sets the trigger to level, resulting in a HUGE
+ * increase of timer interrupts!
+ */
+ if ((bus_irq == 0) && (trigger == 3))
+ trigger = 1;
+
+ intsrc.mpc_type = MP_INTSRC;
+ intsrc.mpc_irqtype = mp_INT;
+ intsrc.mpc_irqflag = (trigger << 2) | polarity;
+ intsrc.mpc_srcbus = MP_ISA_BUS;
+ intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
+ intsrc.mpc_dstirq = pin; /* INTIN# */
+
+ MP_intsrc_info(&intsrc);
+}
+
+void __init mp_config_acpi_legacy_irqs(void)
+{
+ struct mpc_config_intsrc intsrc;
+ int i = 0;
+ int ioapic = -1;
+
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+ /*
+ * Fabricate the legacy ISA bus (bus #31).
+ */
+ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+#endif
+ set_bit(MP_ISA_BUS, mp_bus_not_pci);
+ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+
+ /*
+ * Older generations of ES7000 have no legacy identity mappings
+ */
+ if (es7000_plat == 1)
+ return;
+
+ /*
+ * Locate the IOAPIC that manages the ISA IRQs (0-15).
+ */
+ ioapic = mp_find_ioapic(0);
+ if (ioapic < 0)
+ return;
+
+ intsrc.mpc_type = MP_INTSRC;
+ intsrc.mpc_irqflag = 0; /* Conforming */
+ intsrc.mpc_srcbus = MP_ISA_BUS;
+#ifdef CONFIG_X86_IO_APIC
+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+#endif
+ /*
+ * Use the default configuration for the IRQs 0-15. Unless
+ * overridden by (MADT) interrupt source override entries.
+ */
+ for (i = 0; i < 16; i++) {
+ int idx;
+
+ for (idx = 0; idx < mp_irq_entries; idx++) {
+ struct mpc_config_intsrc *irq = mp_irqs + idx;
+
+ /* Do we already have a mapping for this ISA IRQ? */
+ if (irq->mpc_srcbus == MP_ISA_BUS
+ && irq->mpc_srcbusirq == i)
+ break;
+
+ /* Do we already have a mapping for this IOAPIC pin */
+ if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
+ (irq->mpc_dstirq == i))
+ break;
+ }
+
+ if (idx != mp_irq_entries) {
+ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+ continue; /* IRQ already used */
+ }
+
+ intsrc.mpc_irqtype = mp_INT;
+ intsrc.mpc_srcbusirq = i; /* Identity mapped */
+ intsrc.mpc_dstirq = i;
+
+ MP_intsrc_info(&intsrc);
+ }
+}
+
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
+{
+ int ioapic;
+ int ioapic_pin;
+#ifdef CONFIG_X86_32
+#define MAX_GSI_NUM 4096
+#define IRQ_COMPRESSION_START 64
+
+ static int pci_irq = IRQ_COMPRESSION_START;
+ /*
+ * Mapping between Global System Interrupts, which
+ * represent all possible interrupts, and IRQs
+ * assigned to actual devices.
+ */
+ static int gsi_to_irq[MAX_GSI_NUM];
+#else
+
+ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+ return gsi;
+#endif
+
+ /* Don't set up the ACPI SCI because it's already set up */
+ if (acpi_gbl_FADT.sci_interrupt == gsi)
+ return gsi;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0) {
+ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+ return gsi;
+ }
+
+ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+ if (ioapic_renumber_irq)
+ gsi = ioapic_renumber_irq(ioapic, gsi);
+#endif
+
+ /*
+ * Avoid pin reprogramming. PRTs typically include entries
+ * with redundant pin->gsi mappings (but unique PCI devices);
+ * we only program the IOAPIC on the first.
+ */
+ if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
+ printk(KERN_ERR "Invalid reference to IOAPIC pin "
+ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+ ioapic_pin);
+ return gsi;
+ }
+ if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+ mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+#ifdef CONFIG_X86_32
+ return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
+#else
+ return gsi;
+#endif
+ }
+
+ set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
+#ifdef CONFIG_X86_32
+ /*
+ * For GSI >= 64, use IRQ compression
+ */
+ if ((gsi >= IRQ_COMPRESSION_START)
+ && (triggering == ACPI_LEVEL_SENSITIVE)) {
+ /*
+ * For PCI devices assign IRQs in order, avoiding gaps
+ * due to unused I/O APIC pins.
+ */
+ int irq = gsi;
+ if (gsi < MAX_GSI_NUM) {
+ /*
+ * Retain the VIA chipset work-around (gsi > 15), but
+ * avoid a problem where the 8254 timer (IRQ0) is setup
+ * via an override (so it's not on pin 0 of the ioapic),
+ * and at the same time, the pin 0 interrupt is a PCI
+ * type. The gsi > 15 test could cause these two pins
+ * to be shared as IRQ0, and they are not shareable.
+ * So test for this condition, and if necessary, avoid
+ * the pin collision.
+ */
+ gsi = pci_irq++;
+ /*
+ * Don't assign IRQ used by ACPI SCI
+ */
+ if (gsi == acpi_gbl_FADT.sci_interrupt)
+ gsi = pci_irq++;
+ gsi_to_irq[irq] = gsi;
+ } else {
+ printk(KERN_ERR "GSI %u is too high\n", gsi);
+ return gsi;
+ }
+ }
+#endif
+ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+ return gsi;
+}
+
+#endif /* CONFIG_X86_IO_APIC */
+#endif /* CONFIG_ACPI */
#include "mach_traps.h"
-int unknown_nmi_panic;
+extern void die_nmi(struct pt_regs *, const char *msg);
+
+#ifndef CONFIG_XEN
+
int nmi_watchdog_enabled;
static cpumask_t backtrace_mask = CPU_MASK_NONE;
}
EXPORT_SYMBOL(touch_nmi_watchdog);
-extern void die_nmi(struct pt_regs *, const char *msg);
-
notrace __kprobes int
nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
{
return rc;
}
+#endif /* CONFIG_XEN */
+
#ifdef CONFIG_SYSCTL
+int unknown_nmi_panic;
+
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
{
unsigned char reason = get_nmi_reason();
return 0;
}
+#ifndef CONFIG_XEN
/*
* proc handler for /proc/sys/kernel/nmi
*/
}
return 0;
}
+#endif
#endif
return 0;
}
+#ifndef CONFIG_XEN
void __trigger_all_cpu_backtrace(void)
{
int i;
EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);
+#endif
#include <mach_traps.h>
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
int panic_on_unrecovered_nmi;
+#ifndef CONFIG_XEN
+
+int nmi_watchdog_enabled;
+
static cpumask_t backtrace_mask = CPU_MASK_NONE;
/* nmi_active:
return rc;
}
+#endif /* CONFIG_XEN */
+
static unsigned ignore_nmis;
asmlinkage notrace __kprobes void
#ifdef CONFIG_SYSCTL
+int unknown_nmi_panic;
+
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
{
unsigned char reason = get_nmi_reason();
return 0;
}
+#ifndef CONFIG_XEN
/*
* proc handler for /proc/sys/kernel/nmi
*/
}
return 0;
}
+#endif
#endif
return 0;
}
+#ifndef CONFIG_XEN
void __trigger_all_cpu_backtrace(void)
{
int i;
EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);
+#endif
--- /dev/null
+#include <linux/dma-mapping.h>
+#include <linux/dmar.h>
+#include <linux/bootmem.h>
+#include <linux/pci.h>
+
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/gart.h>
+#include <asm/calgary.h>
+
+int forbid_dac __read_mostly;
+EXPORT_SYMBOL(forbid_dac);
+
+const struct dma_mapping_ops *dma_ops;
+EXPORT_SYMBOL(dma_ops);
+
+static int iommu_sac_force __read_mostly;
+
+#ifdef CONFIG_IOMMU_DEBUG
+int panic_on_overflow __read_mostly = 1;
+int force_iommu __read_mostly = 1;
+#else
+int panic_on_overflow __read_mostly = 0;
+int force_iommu __read_mostly = 0;
+#endif
+
+int iommu_merge __read_mostly = 0;
+
+int no_iommu __read_mostly;
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly = 0;
+
+/* This tells the BIO block layer to assume merging. Default to off
+ because we cannot guarantee merging later. */
+int iommu_bio_merge __read_mostly = 0;
+EXPORT_SYMBOL(iommu_bio_merge);
+
+dma_addr_t bad_dma_address __read_mostly = 0;
+EXPORT_SYMBOL(bad_dma_address);
+
+/* Dummy device used for NULL arguments (normally ISA). Better would
+ be probably a smaller DMA mask, but this is bug-to-bug compatible
+ to older i386. */
+struct device fallback_dev = {
+ .bus_id = "fallback device",
+ .coherent_dma_mask = DMA_32BIT_MASK,
+ .dma_mask = &fallback_dev.coherent_dma_mask,
+};
+
+int dma_set_mask(struct device *dev, u64 mask)
+{
+ if (!dev->dma_mask || !dma_supported(dev, mask))
+ return -EIO;
+
+ *dev->dma_mask = mask;
+
+ return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+static __initdata void *dma32_bootmem_ptr;
+static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
+
+static int __init parse_dma32_size_opt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ dma32_bootmem_size = memparse(p, &p);
+ return 0;
+}
+early_param("dma32_size", parse_dma32_size_opt);
+
+void __init dma32_reserve_bootmem(void)
+{
+ unsigned long size, align;
+ if (end_pfn <= MAX_DMA32_PFN)
+ return;
+
+ align = 64ULL<<20;
+ size = round_up(dma32_bootmem_size, align);
+ dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
+ __pa(MAX_DMA_ADDRESS));
+ if (dma32_bootmem_ptr)
+ dma32_bootmem_size = size;
+ else
+ dma32_bootmem_size = 0;
+}
+static void __init dma32_free_bootmem(void)
+{
+ int node;
+
+ if (end_pfn <= MAX_DMA32_PFN)
+ return;
+
+ if (!dma32_bootmem_ptr)
+ return;
+
+ for_each_online_node(node)
+ free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
+ dma32_bootmem_size);
+
+ dma32_bootmem_ptr = NULL;
+ dma32_bootmem_size = 0;
+}
+#else
+#define dma32_free_bootmem() ((void)0)
+#endif
+
+static const struct dma_mapping_ops swiotlb_dma_ops = {
+ .mapping_error = swiotlb_dma_mapping_error,
+ .map_single = swiotlb_map_single_phys,
+ .unmap_single = swiotlb_unmap_single,
+ .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+ .sync_single_for_device = swiotlb_sync_single_for_device,
+ .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
+ .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
+ .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+ .sync_sg_for_device = swiotlb_sync_sg_for_device,
+ .map_sg = swiotlb_map_sg,
+ .unmap_sg = swiotlb_unmap_sg,
+ .dma_supported = swiotlb_dma_supported
+};
+
+void __init pci_iommu_alloc(void)
+{
+ /* free the range so iommu could get some range less than 4G */
+ dma32_free_bootmem();
+ /*
+ * The order of these functions is important for
+ * fall-back/fail-over reasons
+ */
+#ifdef CONFIG_GART_IOMMU
+ gart_iommu_hole_init();
+#endif
+
+#ifdef CONFIG_CALGARY_IOMMU
+ detect_calgary();
+#endif
+
+ detect_intel_iommu();
+
+#ifdef CONFIG_SWIOTLB
+ swiotlb_init();
+ if (swiotlb) {
+ printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
+ dma_ops = &swiotlb_dma_ops;
+ }
+#endif
+}
+
+/*
+ * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
+ * documentation.
+ */
+static __init int iommu_setup(char *p)
+{
+ iommu_merge = 1;
+
+ if (!p)
+ return -EINVAL;
+
+ while (*p) {
+ if (!strncmp(p, "off", 3))
+ no_iommu = 1;
+ /* gart_parse_options has more force support */
+ if (!strncmp(p, "force", 5))
+ force_iommu = 1;
+ if (!strncmp(p, "noforce", 7)) {
+ iommu_merge = 0;
+ force_iommu = 0;
+ }
+
+ if (!strncmp(p, "biomerge", 8)) {
+ iommu_bio_merge = 4096;
+ iommu_merge = 1;
+ force_iommu = 1;
+ }
+ if (!strncmp(p, "panic", 5))
+ panic_on_overflow = 1;
+ if (!strncmp(p, "nopanic", 7))
+ panic_on_overflow = 0;
+ if (!strncmp(p, "merge", 5)) {
+ iommu_merge = 1;
+ force_iommu = 1;
+ }
+ if (!strncmp(p, "nomerge", 7))
+ iommu_merge = 0;
+ if (!strncmp(p, "forcesac", 8))
+ iommu_sac_force = 1;
+ if (!strncmp(p, "allowdac", 8))
+ forbid_dac = 0;
+ if (!strncmp(p, "nodac", 5))
+ forbid_dac = -1;
+ if (!strncmp(p, "usedac", 6)) {
+ forbid_dac = -1;
+ return 1;
+ }
+#ifdef CONFIG_SWIOTLB
+ if (!strncmp(p, "soft", 4))
+ swiotlb = 1;
+#endif
+
+#ifdef CONFIG_GART_IOMMU
+ gart_parse_options(p);
+#endif
+
+#ifdef CONFIG_CALGARY_IOMMU
+ if (!strncmp(p, "calgary", 7))
+ use_calgary = 1;
+#endif /* CONFIG_CALGARY_IOMMU */
+
+ p += strcspn(p, ",");
+ if (*p == ',')
+ ++p;
+ }
+ return 0;
+}
+early_param("iommu", iommu_setup);
+
+static int check_pages_physically_contiguous(unsigned long pfn,
+ unsigned int offset,
+ size_t length)
+{
+ unsigned long next_mfn;
+ int i;
+ int nr_pages;
+
+ next_mfn = pfn_to_mfn(pfn);
+ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+ for (i = 1; i < nr_pages; i++) {
+ if (pfn_to_mfn(++pfn) != ++next_mfn)
+ return 0;
+ }
+ return 1;
+}
+
+int range_straddles_page_boundary(paddr_t p, size_t size)
+{
+ extern unsigned long *contiguous_bitmap;
+ unsigned long pfn = p >> PAGE_SHIFT;
+ unsigned int offset = p & ~PAGE_MASK;
+
+ if (offset + size <= PAGE_SIZE)
+ return 0;
+ if (test_bit(pfn, contiguous_bitmap))
+ return 0;
+ if (check_pages_physically_contiguous(pfn, offset, size))
+ return 0;
+ return 1;
+}
+
+#ifdef CONFIG_X86_32
+int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
+ dma_addr_t device_addr, size_t size, int flags)
+{
+ void __iomem *mem_base = NULL;
+ int pages = size >> PAGE_SHIFT;
+ int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
+
+ if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
+ goto out;
+ if (!size)
+ goto out;
+ if (dev->dma_mem)
+ goto out;
+
+ /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
+
+ mem_base = ioremap(bus_addr, size);
+ if (!mem_base)
+ goto out;
+
+ dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
+ if (!dev->dma_mem)
+ goto out;
+ dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ if (!dev->dma_mem->bitmap)
+ goto free1_out;
+
+ dev->dma_mem->virt_base = mem_base;
+ dev->dma_mem->device_base = device_addr;
+ dev->dma_mem->size = pages;
+ dev->dma_mem->flags = flags;
+
+ if (flags & DMA_MEMORY_MAP)
+ return DMA_MEMORY_MAP;
+
+ return DMA_MEMORY_IO;
+
+ free1_out:
+ kfree(dev->dma_mem);
+ out:
+ if (mem_base)
+ iounmap(mem_base);
+ return 0;
+}
+EXPORT_SYMBOL(dma_declare_coherent_memory);
+
+void dma_release_declared_memory(struct device *dev)
+{
+ struct dma_coherent_mem *mem = dev->dma_mem;
+
+ if (!mem)
+ return;
+ dev->dma_mem = NULL;
+ iounmap(mem->virt_base);
+ kfree(mem->bitmap);
+ kfree(mem);
+}
+EXPORT_SYMBOL(dma_release_declared_memory);
+
+void *dma_mark_declared_memory_occupied(struct device *dev,
+ dma_addr_t device_addr, size_t size)
+{
+ struct dma_coherent_mem *mem = dev->dma_mem;
+ int pos, err;
+ int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
+
+ pages >>= PAGE_SHIFT;
+
+ if (!mem)
+ return ERR_PTR(-EINVAL);
+
+ pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
+ err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
+ if (err != 0)
+ return ERR_PTR(err);
+ return mem->virt_base + (pos << PAGE_SHIFT);
+}
+EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
+
+static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
+ dma_addr_t *dma_handle, void **ret)
+{
+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+ int order = get_order(size);
+
+ if (mem) {
+ int page = bitmap_find_free_region(mem->bitmap, mem->size,
+ order);
+ if (page >= 0) {
+ *dma_handle = mem->device_base + (page << PAGE_SHIFT);
+ *ret = mem->virt_base + (page << PAGE_SHIFT);
+ memset(*ret, 0, size);
+ }
+ if (mem->flags & DMA_MEMORY_EXCLUSIVE)
+ *ret = NULL;
+ }
+ return (mem != NULL);
+}
+
+static int dma_release_coherent(struct device *dev, int order, void *vaddr)
+{
+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+
+ if (mem && vaddr >= mem->virt_base && vaddr <
+ (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
+
+ bitmap_release_region(mem->bitmap, page, order);
+ return 1;
+ }
+ return 0;
+}
+#else
+#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
+#define dma_release_coherent(dev, order, vaddr) (0)
+#endif /* CONFIG_X86_32 */
+
+int dma_supported(struct device *dev, u64 mask)
+{
+#ifdef CONFIG_PCI
+ if (mask > 0xffffffff && forbid_dac > 0) {
+ printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
+ dev->bus_id);
+ return 0;
+ }
+#endif
+
+ if (dma_ops->dma_supported)
+ return dma_ops->dma_supported(dev, mask);
+
+ /* Copied from i386. Doesn't make much sense, because it will
+ only work for pci_alloc_coherent.
+ The caller just has to use GFP_DMA in this case. */
+ if (mask < DMA_24BIT_MASK)
+ return 0;
+
+ /* Tell the device to use SAC when IOMMU force is on. This
+ allows the driver to use cheaper accesses in some cases.
+
+ Problem with this is that if we overflow the IOMMU area and
+ return DAC as fallback address the device may not handle it
+ correctly.
+
+ As a special case some controllers have a 39bit address
+ mode that is as efficient as 32bit (aic79xx). Don't force
+ SAC for these. Assume all masks <= 40 bits are of this
+ type. Normally this doesn't make any difference, but gives
+ more gentle handling of IOMMU overflow. */
+ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
+ printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
+ dev->bus_id, mask);
+ return 0;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(dma_supported);
+
+/* Allocate DMA memory on node near device */
+static struct page *
+dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
+{
+ int node;
+
+ node = dev_to_node(dev);
+
+ return alloc_pages_node(node, gfp, order);
+}
+
+/*
+ * Allocate memory for a coherent mapping.
+ */
+void *
+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t gfp)
+{
+ void *memory = NULL;
+ struct page *page;
+ unsigned long dma_mask = 0;
+ int noretry = 0;
+ unsigned int order = get_order(size);
+
+ /* ignore region specifiers */
+ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
+ if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
+ return memory;
+
+ if (!dev) {
+ dev = &fallback_dev;
+ gfp |= GFP_DMA;
+ }
+ dma_mask = dev->coherent_dma_mask;
+ if (dma_mask == 0)
+ dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
+
+ /* Device not DMA able */
+ if (dev->dma_mask == NULL)
+ return NULL;
+
+ /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
+ if (gfp & __GFP_DMA)
+ noretry = 1;
+
+#ifdef CONFIG_XEN
+ gfp &= ~(__GFP_DMA | __GFP_DMA32);
+#else
+#ifdef CONFIG_X86_64
+ /* Why <=? Even when the mask is smaller than 4GB it is often
+ larger than 16MB and in this case we have a chance of
+ finding fitting memory in the next higher zone first. If
+ not retry with true GFP_DMA. -AK */
+ if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
+ gfp |= GFP_DMA32;
+#endif
+
+ again:
+#endif
+ page = dma_alloc_pages(dev,
+ noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
+ if (page == NULL)
+ return NULL;
+
+#ifndef CONFIG_XEN
+ {
+ int high, mmu;
+ dma_addr_t bus = page_to_phys(page);
+ memory = page_address(page);
+ high = (bus + size) >= dma_mask;
+ mmu = high;
+ if (force_iommu && !(gfp & GFP_DMA))
+ mmu = 1;
+ else if (high) {
+ free_pages((unsigned long)memory, order);
+
+ /* Don't use the 16MB ZONE_DMA unless absolutely
+ needed. It's better to use remapping first. */
+ if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
+ gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
+ goto again;
+ }
+
+ /* Let low level make its own zone decisions */
+ gfp &= ~(GFP_DMA32|GFP_DMA);
+
+ if (dma_ops->alloc_coherent)
+ return dma_ops->alloc_coherent(dev, size,
+ dma_handle, gfp);
+ return NULL;
+ }
+
+ memset(memory, 0, size);
+ if (!mmu) {
+ *dma_handle = bus;
+ return memory;
+ }
+ }
+
+ if (dma_ops->alloc_coherent) {
+ free_pages((unsigned long)memory, order);
+ gfp &= ~(GFP_DMA|GFP_DMA32);
+ return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
+ }
+
+ if (dma_ops->map_simple) {
+ *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
+ size,
+ PCI_DMA_BIDIRECTIONAL);
+ if (*dma_handle != bad_dma_address)
+ return memory;
+ }
+#else
+ memory = page_address(page);
+ if (xen_create_contiguous_region((unsigned long)memory, order,
+ fls64(dma_mask)) == 0) {
+ memset(memory, 0, size);
+ *dma_handle = virt_to_bus(memory);
+ return memory;
+ }
+#endif
+
+ if (panic_on_overflow)
+ panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
+ (unsigned long)size);
+ free_pages((unsigned long)memory, get_order(size));
+ return NULL;
+}
+EXPORT_SYMBOL(dma_alloc_coherent);
+
+/*
+ * Unmap coherent memory.
+ * The caller must ensure that the device has finished accessing the mapping.
+ */
+void dma_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t bus)
+{
+ int order = get_order(size);
+ WARN_ON(irqs_disabled()); /* for portability */
+ if (dma_release_coherent(dev, order, vaddr))
+ return;
+#ifndef CONFIG_XEN
+ if (dma_ops->unmap_single)
+ dma_ops->unmap_single(dev, bus, size, 0);
+#endif
+ xen_destroy_contiguous_region((unsigned long)vaddr, order);
+ free_pages((unsigned long)vaddr, order);
+}
+EXPORT_SYMBOL(dma_free_coherent);
+
+static int __init pci_iommu_init(void)
+{
+#ifdef CONFIG_CALGARY_IOMMU
+ calgary_iommu_init();
+#endif
+
+ intel_iommu_init();
+
+#ifdef CONFIG_GART_IOMMU
+ gart_iommu_init();
+#endif
+
+ no_iommu_init();
+ return 0;
+}
+
+void pci_iommu_shutdown(void)
+{
+ gart_iommu_shutdown();
+}
+/* Must execute after PCI subsystem */
+fs_initcall(pci_iommu_init);
+
+#ifdef CONFIG_PCI
+/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
+
+static __devinit void via_no_dac(struct pci_dev *dev)
+{
+ if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+ printk(KERN_INFO "PCI: VIA PCI bridge detected."
+ "Disabling DAC.\n");
+ forbid_dac = 1;
+ }
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+#endif
--- /dev/null
+#include <linux/dma-mapping.h>
+#include <linux/dmar.h>
+#include <linux/bootmem.h>
+#include <linux/pci.h>
+
+#include <xen/gnttab.h>
+
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/swiotlb.h>
+#include <asm/tlbflush.h>
+#include <asm/gnttab_dma.h>
+#include <asm/bug.h>
+
+#define IOMMU_BUG_ON(test) \
+do { \
+ if (unlikely(test)) { \
+ printk(KERN_ALERT "Fatal DMA error! " \
+ "Please use 'swiotlb=force'\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+static int
+gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
+ int direction)
+{
+ unsigned int i;
+ struct scatterlist *sg;
+
+ WARN_ON(nents == 0 || sgl->length == 0);
+
+ for_each_sg(sgl, sg, nents, i) {
+ BUG_ON(!sg_page(sg));
+ sg->dma_address =
+ gnttab_dma_map_page(sg_page(sg)) + sg->offset;
+ sg->dma_length = sg->length;
+ IOMMU_BUG_ON(address_needs_mapping(
+ hwdev, sg->dma_address));
+ IOMMU_BUG_ON(range_straddles_page_boundary(
+ page_to_pseudophys(sg_page(sg)) + sg->offset,
+ sg->length));
+ }
+
+ return nents;
+}
+
+static void
+gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
+ int direction)
+{
+ unsigned int i;
+ struct scatterlist *sg;
+
+ for_each_sg(sgl, sg, nents, i)
+ gnttab_dma_unmap_page(sg->dma_address);
+}
+
+static dma_addr_t
+gnttab_map_single(struct device *dev, phys_addr_t paddr, size_t size,
+ int direction)
+{
+ dma_addr_t dma;
+
+ WARN_ON(size == 0);
+
+ dma = gnttab_dma_map_page(pfn_to_page(paddr >> PAGE_SHIFT)) +
+ offset_in_page(paddr);
+ IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size));
+ IOMMU_BUG_ON(address_needs_mapping(dev, dma));
+
+ return dma;
+}
+
+static void
+gnttab_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+ int direction)
+{
+ gnttab_dma_unmap_page(dma_addr);
+}
+
+static int nommu_mapping_error(dma_addr_t dma_addr)
+{
+ return (dma_addr == bad_dma_address);
+}
+
+static const struct dma_mapping_ops nommu_dma_ops = {
+ .map_single = gnttab_map_single,
+ .unmap_single = gnttab_unmap_single,
+ .map_sg = gnttab_map_sg,
+ .unmap_sg = gnttab_unmap_sg,
+ .dma_supported = swiotlb_dma_supported,
+ .mapping_error = nommu_mapping_error
+};
+
+void __init no_iommu_init(void)
+{
+ if (dma_ops)
+ return;
+
+ force_iommu = 0; /* no HW IOMMU */
+ dma_ops = &nommu_dma_ops;
+}
struct platform_device *pd;
int ret;
+#ifdef CONFIG_XEN
+ if (!is_initial_xendomain())
+ return 0;
+#endif
+
pd = platform_device_alloc("pcspkr", -1);
if (!pd)
return -ENOMEM;
--- /dev/null
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/pm.h>
+
+struct kmem_cache *task_xstate_cachep;
+
+int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
+{
+ *dst = *src;
+ if (src->thread.xstate) {
+ dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
+ GFP_KERNEL);
+ if (!dst->thread.xstate)
+ return -ENOMEM;
+ WARN_ON((unsigned long)dst->thread.xstate & 15);
+ memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
+ }
+ return 0;
+}
+
+void free_thread_xstate(struct task_struct *tsk)
+{
+ if (tsk->thread.xstate) {
+ kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
+ tsk->thread.xstate = NULL;
+ }
+}
+
+void free_thread_info(struct thread_info *ti)
+{
+ free_thread_xstate(ti->task);
+ free_pages((unsigned long)ti, get_order(THREAD_SIZE));
+}
+
+void arch_task_cache_init(void)
+{
+ task_xstate_cachep =
+ kmem_cache_create("task_xstate", xstate_size,
+ __alignof__(union thread_xstate),
+ SLAB_PANIC, NULL);
+}
+
+static void do_nothing(void *unused)
+{
+}
+
+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
+ * handler on SMP systems.
+ *
+ * Caller must have changed pm_idle to the new value before the call. Old
+ * pm_idle value will not be used by any CPU after the return of this function.
+ */
+void cpu_idle_wait(void)
+{
+ smp_mb();
+ /* kick all the CPUs so that they exit out of pm_idle */
+ smp_call_function(do_nothing, NULL, 0, 1);
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+#ifndef CONFIG_XEN
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+{
+ if (!need_resched()) {
+ __monitor((void *)¤t_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __mwait(ax, cx);
+ }
+}
+
+/* Default MONITOR/MWAIT with no hints, used for default C1 state */
+static void mwait_idle(void)
+{
+ if (!need_resched()) {
+ __monitor((void *)¤t_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __sti_mwait(0, 0);
+ else
+ local_irq_enable();
+ } else
+ local_irq_enable();
+}
+#endif
+
+/*
+ * On SMP it's slightly faster (but much more power-consuming!)
+ * to poll the ->work.need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ */
+static void poll_idle(void)
+{
+ local_irq_enable();
+ cpu_relax();
+}
+
+#ifndef CONFIG_XEN
+/*
+ * mwait selection logic:
+ *
+ * It depends on the CPU. For AMD CPUs that support MWAIT this is
+ * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
+ * then depend on a clock divisor and current Pstate of the core. If
+ * all cores of a processor are in halt state (C1) the processor can
+ * enter the C1E (C1 enhanced) state. If mwait is used this will never
+ * happen.
+ *
+ * idle=mwait overrides this decision and forces the usage of mwait.
+ */
+static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
+{
+ if (force_mwait)
+ return 1;
+
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ switch(c->x86) {
+ case 0x10:
+ case 0x11:
+ return 0;
+ }
+ }
+ return 1;
+}
+#endif
+
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+{
+#ifndef CONFIG_XEN
+ static int selected;
+
+ if (selected)
+ return;
+#ifdef CONFIG_X86_SMP
+ if (pm_idle == poll_idle && smp_num_siblings > 1) {
+ printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
+ " performance may degrade.\n");
+ }
+#endif
+ if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
+ /*
+ * Skip, if setup has overridden idle.
+ * One CPU supports mwait => All CPUs supports mwait
+ */
+ if (!pm_idle) {
+ printk(KERN_INFO "using mwait in idle threads.\n");
+ pm_idle = mwait_idle;
+ }
+ }
+ selected = 1;
+#endif
+}
+
+static int __init idle_setup(char *str)
+{
+ if (!strcmp(str, "poll")) {
+ printk("using polling idle threads.\n");
+ pm_idle = poll_idle;
+ }
+#ifndef CONFIG_XEN
+ else if (!strcmp(str, "mwait"))
+ force_mwait = 1;
+#endif
+ else
+ return -1;
+
+ boot_option_idle_override = 1;
+ return 0;
+}
+early_param("idle", idle_setup);
+
--- /dev/null
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * This file handles the architecture-dependent parts of process handling..
+ */
+
+#include <stdarg.h>
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/elfcore.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/user.h>
+#include <linux/interrupt.h>
+#include <linux/utsname.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/mc146818rtc.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/ptrace.h>
+#include <linux/random.h>
+#include <linux/personality.h>
+#include <linux/tick.h>
+#include <linux/percpu.h>
+#include <linux/prctl.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/ldt.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/desc.h>
+#ifdef CONFIG_MATH_EMULATION
+#include <asm/math_emu.h>
+#endif
+
+#include <xen/interface/physdev.h>
+#include <xen/interface/vcpu.h>
+#include <xen/cpu_hotplug.h>
+
+#include <linux/err.h>
+
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
+#include <asm/kdebug.h>
+
+asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
+
+static int hlt_counter;
+
+unsigned long boot_option_idle_override = 0;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+DEFINE_PER_CPU(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
+
+/*
+ * Return saved PC of a blocked thread.
+ */
+unsigned long thread_saved_pc(struct task_struct *tsk)
+{
+ return ((unsigned long *)tsk->thread.sp)[3];
+}
+
+/*
+ * Powermanagement idle function, if any..
+ */
+void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
+
+void disable_hlt(void)
+{
+ hlt_counter++;
+}
+
+EXPORT_SYMBOL(disable_hlt);
+
+void enable_hlt(void)
+{
+ hlt_counter--;
+}
+
+EXPORT_SYMBOL(enable_hlt);
+
+static void xen_idle(void)
+{
+ current_thread_info()->status &= ~TS_POLLING;
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
+
+ if (!need_resched())
+ safe_halt(); /* enables interrupts racelessly */
+ else
+ local_irq_enable();
+ current_thread_info()->status |= TS_POLLING;
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern cpumask_t cpu_initialized;
+static inline void play_dead(void)
+{
+ idle_task_exit();
+ local_irq_disable();
+ cpu_clear(smp_processor_id(), cpu_initialized);
+ preempt_enable_no_resched();
+ VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
+ cpu_bringup();
+}
+#else
+static inline void play_dead(void)
+{
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * The idle thread. There's no useful work to be
+ * done, so just try to conserve power and have a
+ * low exit latency (ie sit in a loop waiting for
+ * somebody to say that they'd like to reschedule)
+ */
+void cpu_idle(void)
+{
+ int cpu = smp_processor_id();
+
+ current_thread_info()->status |= TS_POLLING;
+
+ /* endless idle loop with no priority at all */
+ while (1) {
+ tick_nohz_stop_sched_tick();
+ while (!need_resched()) {
+ void (*idle)(void);
+
+ check_pgt_cache();
+ rmb();
+ idle = xen_idle; /* no alternatives */
+
+ if (rcu_pending(cpu))
+ rcu_check_callbacks(cpu, 0);
+
+ if (cpu_is_offline(cpu))
+ play_dead();
+
+ local_irq_disable();
+ __get_cpu_var(irq_stat).idle_timestamp = jiffies;
+ idle();
+ }
+ tick_nohz_restart_sched_tick();
+ preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+ }
+}
+
+void __show_registers(struct pt_regs *regs, int all)
+{
+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
+ unsigned long d0, d1, d2, d3, d6, d7;
+ unsigned long sp;
+ unsigned short ss, gs;
+
+ if (user_mode_vm(regs)) {
+ sp = regs->sp;
+ ss = regs->ss & 0xffff;
+ savesegment(gs, gs);
+ } else {
+ sp = (unsigned long) (®s->sp);
+ savesegment(ss, ss);
+ savesegment(gs, gs);
+ }
+
+ printk("\n");
+ printk("Pid: %d, comm: %s %s (%s %.*s)\n",
+ task_pid_nr(current), current->comm,
+ print_tainted(), init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+
+ printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
+ (u16)regs->cs, regs->ip, regs->flags,
+ smp_processor_id());
+ print_symbol("EIP is at %s\n", regs->ip);
+
+ printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+ regs->ax, regs->bx, regs->cx, regs->dx);
+ printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
+ regs->si, regs->di, regs->bp, sp);
+ printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
+ (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
+
+ if (!all)
+ return;
+
+ cr0 = read_cr0();
+ cr2 = read_cr2();
+ cr3 = read_cr3();
+ cr4 = read_cr4_safe();
+ printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
+ cr0, cr2, cr3, cr4);
+
+ get_debugreg(d0, 0);
+ get_debugreg(d1, 1);
+ get_debugreg(d2, 2);
+ get_debugreg(d3, 3);
+ printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+ d0, d1, d2, d3);
+
+ get_debugreg(d6, 6);
+ get_debugreg(d7, 7);
+ printk("DR6: %08lx DR7: %08lx\n",
+ d6, d7);
+}
+
+void show_regs(struct pt_regs *regs)
+{
+ __show_registers(regs, 1);
+ show_trace(NULL, regs, ®s->sp, regs->bp);
+}
+
+/*
+ * This gets run with %bx containing the
+ * function to call, and %dx containing
+ * the "args".
+ */
+extern void kernel_thread_helper(void);
+
+/*
+ * Create a kernel thread
+ */
+int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+{
+ struct pt_regs regs;
+
+ memset(®s, 0, sizeof(regs));
+
+ regs.bx = (unsigned long) fn;
+ regs.dx = (unsigned long) arg;
+
+ regs.ds = __USER_DS;
+ regs.es = __USER_DS;
+ regs.fs = __KERNEL_PERCPU;
+ regs.orig_ax = -1;
+ regs.ip = (unsigned long) kernel_thread_helper;
+ regs.cs = __KERNEL_CS | get_kernel_rpl();
+ regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
+
+ /* Ok, create the new process.. */
+ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL);
+}
+EXPORT_SYMBOL(kernel_thread);
+
+/*
+ * Free current thread data structures etc..
+ */
+void exit_thread(void)
+{
+ /* The process may have allocated an io port bitmap... nuke it. */
+ if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
+ struct task_struct *tsk = current;
+ struct thread_struct *t = &tsk->thread;
+ struct physdev_set_iobitmap set_iobitmap;
+ memset(&set_iobitmap, 0, sizeof(set_iobitmap));
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
+ &set_iobitmap));
+ kfree(t->io_bitmap_ptr);
+ t->io_bitmap_ptr = NULL;
+ clear_thread_flag(TIF_IO_BITMAP);
+ }
+}
+
+void flush_thread(void)
+{
+ struct task_struct *tsk = current;
+
+ tsk->thread.debugreg0 = 0;
+ tsk->thread.debugreg1 = 0;
+ tsk->thread.debugreg2 = 0;
+ tsk->thread.debugreg3 = 0;
+ tsk->thread.debugreg6 = 0;
+ tsk->thread.debugreg7 = 0;
+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ clear_tsk_thread_flag(tsk, TIF_DEBUG);
+ /*
+ * Forget coprocessor state..
+ */
+ tsk->fpu_counter = 0;
+ clear_fpu(tsk);
+ clear_used_math();
+}
+
+void release_thread(struct task_struct *dead_task)
+{
+ BUG_ON(dead_task->mm);
+ release_vm86_irqs(dead_task);
+}
+
+/*
+ * This gets called before we allocate a new thread and copy
+ * the current task into it.
+ */
+void prepare_to_copy(struct task_struct *tsk)
+{
+ unlazy_fpu(tsk);
+}
+
+int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+ unsigned long unused,
+ struct task_struct * p, struct pt_regs * regs)
+{
+ struct pt_regs * childregs;
+ struct task_struct *tsk;
+ int err;
+
+ childregs = task_pt_regs(p);
+ *childregs = *regs;
+ childregs->ax = 0;
+ childregs->sp = sp;
+
+ p->thread.sp = (unsigned long) childregs;
+ p->thread.sp0 = (unsigned long) (childregs+1);
+
+ p->thread.ip = (unsigned long) ret_from_fork;
+
+ savesegment(gs, p->thread.gs);
+
+ tsk = current;
+ if (test_tsk_thread_flag(tsk, TIF_CSTAR))
+ p->thread.ip = (unsigned long) cstar_ret_from_fork;
+ if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
+ p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES, GFP_KERNEL);
+ if (!p->thread.io_bitmap_ptr) {
+ p->thread.io_bitmap_max = 0;
+ return -ENOMEM;
+ }
+ set_tsk_thread_flag(p, TIF_IO_BITMAP);
+ }
+
+ err = 0;
+
+ /*
+ * Set a new TLS for the child thread?
+ */
+ if (clone_flags & CLONE_SETTLS)
+ err = do_set_thread_area(p, -1,
+ (struct user_desc __user *)childregs->si, 0);
+
+ p->thread.iopl = current->thread.iopl;
+
+ if (err && p->thread.io_bitmap_ptr) {
+ kfree(p->thread.io_bitmap_ptr);
+ p->thread.io_bitmap_max = 0;
+ }
+ return err;
+}
+
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+ __asm__("movl %0, %%gs" :: "r"(0));
+ regs->fs = 0;
+ set_fs(USER_DS);
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+ regs->ss = __USER_DS;
+ regs->cs = __USER_CS;
+ regs->ip = new_ip;
+ regs->sp = new_sp;
+ /*
+ * Free the old FP and other extended state
+ */
+ free_thread_xstate(current);
+}
+EXPORT_SYMBOL_GPL(start_thread);
+
+static void hard_disable_TSC(void)
+{
+ write_cr4(read_cr4() | X86_CR4_TSD);
+}
+
+void disable_TSC(void)
+{
+#ifdef CONFIG_SECCOMP_DISABLE_TSC
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_disable_TSC();
+ preempt_enable();
+#endif
+}
+
+static void hard_enable_TSC(void)
+{
+ write_cr4(read_cr4() & ~X86_CR4_TSD);
+}
+
+static void enable_TSC(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_enable_TSC();
+ preempt_enable();
+}
+
+int get_tsc_mode(unsigned long adr)
+{
+ unsigned int val;
+
+ if (test_thread_flag(TIF_NOTSC))
+ val = PR_TSC_SIGSEGV;
+ else
+ val = PR_TSC_ENABLE;
+
+ return put_user(val, (unsigned int __user *)adr);
+}
+
+int set_tsc_mode(unsigned int val)
+{
+ if (val == PR_TSC_SIGSEGV)
+ disable_TSC();
+ else if (val == PR_TSC_ENABLE)
+ enable_TSC();
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+static noinline void
+__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
+{
+ struct thread_struct *prev, *next;
+ unsigned long debugctl;
+
+ prev = &prev_p->thread;
+ next = &next_p->thread;
+
+ debugctl = prev->debugctlmsr;
+ if (next->ds_area_msr != prev->ds_area_msr) {
+ /* we clear debugctl to make sure DS
+ * is not in use when we change it */
+ debugctl = 0;
+ update_debugctlmsr(0);
+ wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
+ }
+
+ if (next->debugctlmsr != debugctl)
+ update_debugctlmsr(next->debugctlmsr);
+
+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+ set_debugreg(next->debugreg0, 0);
+ set_debugreg(next->debugreg1, 1);
+ set_debugreg(next->debugreg2, 2);
+ set_debugreg(next->debugreg3, 3);
+ /* no 4 and 5 */
+ set_debugreg(next->debugreg6, 6);
+ set_debugreg(next->debugreg7, 7);
+ }
+
+ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+ test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+ /* prev and next are different */
+ if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+ hard_disable_TSC();
+ else
+ hard_enable_TSC();
+ }
+
+#ifdef X86_BTS
+ if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
+ ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
+
+ if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
+ ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
+#endif
+}
+
+/*
+ * switch_to(x,yn) should switch tasks from x to y.
+ *
+ * We fsave/fwait so that an exception goes off at the right time
+ * (as a call from the fsave or fwait in effect) rather than to
+ * the wrong process. Lazy FP saving no longer makes any sense
+ * with modern CPU's, and this simplifies a lot of things (SMP
+ * and UP become the same).
+ *
+ * NOTE! We used to use the x86 hardware context switching. The
+ * reason for not using it any more becomes apparent when you
+ * try to recover gracefully from saved state that is no longer
+ * valid (stale segment register values in particular). With the
+ * hardware task-switch, there is no way to fix up bad state in
+ * a reasonable manner.
+ *
+ * The fact that Intel documents the hardware task-switching to
+ * be slow is a fairly red herring - this code is not noticeably
+ * faster. However, there _is_ some room for improvement here,
+ * so the performance issues may eventually be a valid point.
+ * More important, however, is the fact that this allows us much
+ * more flexibility.
+ *
+ * The return value (in %ax) will be the "prev" task after
+ * the task-switch, and shows up in ret_from_fork in entry.S,
+ * for example.
+ */
+struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+ struct thread_struct *prev = &prev_p->thread,
+ *next = &next_p->thread;
+ int cpu = smp_processor_id();
+#ifndef CONFIG_X86_NO_TSS
+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
+#endif
+ struct physdev_set_iopl iopl_op;
+ struct physdev_set_iobitmap iobmp_op;
+ multicall_entry_t _mcl[8], *mcl = _mcl;
+
+ /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
+
+ /*
+ * This is basically '__unlazy_fpu', except that we queue a
+ * multicall to indicate FPU task switch, rather than
+ * synchronously trapping to Xen.
+ */
+ if (task_thread_info(prev_p)->status & TS_USEDFPU) {
+ __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
+ mcl->op = __HYPERVISOR_fpu_taskswitch;
+ mcl->args[0] = 1;
+ mcl++;
+ }
+#if 0 /* lazy fpu sanity check */
+ else BUG_ON(!(read_cr0() & 8));
+#endif
+
+ /*
+ * Reload sp0.
+ * This is load_sp0(tss, next) with a multicall.
+ */
+ mcl->op = __HYPERVISOR_stack_switch;
+ mcl->args[0] = __KERNEL_DS;
+ mcl->args[1] = next->sp0;
+ mcl++;
+
+ /*
+ * Load the per-thread Thread-Local Storage descriptor.
+ * This is load_TLS(next, cpu) with multicalls.
+ */
+#define C(i) do { \
+ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
+ next->tls_array[i].b != prev->tls_array[i].b)) { \
+ mcl->op = __HYPERVISOR_update_descriptor; \
+ *(u64 *)&mcl->args[0] = virt_to_machine( \
+ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
+ *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \
+ mcl++; \
+ } \
+} while (0)
+ C(0); C(1); C(2);
+#undef C
+
+ if (unlikely(prev->iopl != next->iopl)) {
+ iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
+ mcl->op = __HYPERVISOR_physdev_op;
+ mcl->args[0] = PHYSDEVOP_set_iopl;
+ mcl->args[1] = (unsigned long)&iopl_op;
+ mcl++;
+ }
+
+ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
+ set_xen_guest_handle(iobmp_op.bitmap,
+ (char *)next->io_bitmap_ptr);
+ iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
+ mcl->op = __HYPERVISOR_physdev_op;
+ mcl->args[0] = PHYSDEVOP_set_iobitmap;
+ mcl->args[1] = (unsigned long)&iobmp_op;
+ mcl++;
+ }
+
+ BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
+ if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
+ BUG();
+
+ /* we're going to use this soon, after a few expensive things */
+ if (next_p->fpu_counter > 5)
+ prefetch(next->xstate);
+
+ /*
+ * Now maybe handle debug registers
+ */
+ if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
+ task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
+ __switch_to_xtra(prev_p, next_p);
+
+ /*
+ * Leave lazy mode, flushing any hypercalls made here.
+ * This must be done before restoring TLS segments so
+ * the GDT and LDT are properly updated, and must be
+ * done before math_state_restore, so the TS bit is up
+ * to date.
+ */
+ arch_leave_lazy_cpu_mode();
+
+ /* If the task has used fpu the last 5 timeslices, just do a full
+ * restore of the math state immediately to avoid the trap; the
+ * chances of needing FPU soon are obviously high now
+ *
+ * tsk_used_math() checks prevent calling math_state_restore(),
+ * which can sleep in the case of !tsk_used_math()
+ */
+ if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
+ math_state_restore();
+
+ /*
+ * Restore %gs if needed (which is common)
+ */
+ if (prev->gs | next->gs)
+ loadsegment(gs, next->gs);
+
+ x86_write_percpu(current_task, next_p);
+
+ return prev_p;
+}
+
+asmlinkage int sys_fork(struct pt_regs regs)
+{
+ return do_fork(SIGCHLD, regs.sp, ®s, 0, NULL, NULL);
+}
+
+asmlinkage int sys_clone(struct pt_regs regs)
+{
+ unsigned long clone_flags;
+ unsigned long newsp;
+ int __user *parent_tidptr, *child_tidptr;
+
+ clone_flags = regs.bx;
+ newsp = regs.cx;
+ parent_tidptr = (int __user *)regs.dx;
+ child_tidptr = (int __user *)regs.di;
+ if (!newsp)
+ newsp = regs.sp;
+ return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr);
+}
+
+/*
+ * This is trivial, and on the face of it looks like it
+ * could equally well be done in user mode.
+ *
+ * Not so, for quite unobvious reasons - register pressure.
+ * In user mode vfork() cannot have a stack frame, and if
+ * done by calling the "clone()" system call directly, you
+ * do not have enough call-clobbered registers to hold all
+ * the information you need.
+ */
+asmlinkage int sys_vfork(struct pt_regs regs)
+{
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, ®s, 0, NULL, NULL);
+}
+
+/*
+ * sys_execve() executes a new program.
+ */
+asmlinkage int sys_execve(struct pt_regs regs)
+{
+ int error;
+ char * filename;
+
+ filename = getname((char __user *) regs.bx);
+ error = PTR_ERR(filename);
+ if (IS_ERR(filename))
+ goto out;
+ error = do_execve(filename,
+ (char __user * __user *) regs.cx,
+ (char __user * __user *) regs.dx,
+ ®s);
+ if (error == 0) {
+ /* Make sure we don't return using sysenter.. */
+ set_thread_flag(TIF_IRET);
+ }
+ putname(filename);
+out:
+ return error;
+}
+
+#define top_esp (THREAD_SIZE - sizeof(unsigned long))
+#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
+
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long bp, sp, ip;
+ unsigned long stack_page;
+ int count = 0;
+ if (!p || p == current || p->state == TASK_RUNNING)
+ return 0;
+ stack_page = (unsigned long)task_stack_page(p);
+ sp = p->thread.sp;
+ if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
+ return 0;
+ /* include/asm-i386/system.h:switch_to() pushes bp last. */
+ bp = *(unsigned long *) sp;
+ do {
+ if (bp < stack_page || bp > top_ebp+stack_page)
+ return 0;
+ ip = *(unsigned long *) (bp+4);
+ if (!in_sched_functions(ip))
+ return ip;
+ bp = *(unsigned long *) bp;
+ } while (count++ < 16);
+ return 0;
+}
+
+unsigned long arch_align_stack(unsigned long sp)
+{
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+ sp -= get_random_int() % 8192;
+ return sp & ~0xf;
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ unsigned long range_end = mm->brk + 0x02000000;
+ return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
--- /dev/null
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ * X86-64 port
+ * Andi Kleen.
+ *
+ * CPU hotplug support - ashok.raj@intel.com
+ *
+ * Jun Nakajima <jun.nakajima@intel.com>
+ * Modified for Xen
+ */
+
+/*
+ * This file handles the architecture-dependent parts of process handling..
+ */
+
+#include <stdarg.h>
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/elfcore.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/interrupt.h>
+#include <linux/utsname.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/random.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/tick.h>
+#include <linux/prctl.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/mmu_context.h>
+#include <asm/pda.h>
+#include <asm/prctl.h>
+#include <xen/interface/platform.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/vcpu.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/hardirq.h>
+#include <asm/ia32.h>
+#include <asm/idle.h>
+
+#include <xen/cpu_hotplug.h>
+
+asmlinkage extern void ret_from_fork(void);
+
+unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
+
+unsigned long boot_option_idle_override = 0;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+/*
+ * Powermanagement idle function, if any..
+ */
+void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
+
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&idle_notifier, n);
+}
+
+void enter_idle(void)
+{
+ write_pda(isidle, 1);
+ atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+}
+
+static void __exit_idle(void)
+{
+ if (test_and_clear_bit_pda(0, isidle) == 0)
+ return;
+ atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+}
+
+/* Called from interrupts to signify idle end */
+void exit_idle(void)
+{
+ /* idle loop has pid 0 */
+ if (current->pid)
+ return;
+ __exit_idle();
+}
+
+static void xen_idle(void)
+{
+ current_thread_info()->status &= ~TS_POLLING;
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
+ if (!need_resched())
+ safe_halt(); /* enables interrupts racelessly */
+ else
+ local_irq_enable();
+ current_thread_info()->status |= TS_POLLING;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static inline void play_dead(void)
+{
+ idle_task_exit();
+ local_irq_disable();
+ cpu_clear(smp_processor_id(), cpu_initialized);
+ preempt_enable_no_resched();
+ VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
+ cpu_bringup();
+}
+#else
+static inline void play_dead(void)
+{
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * The idle thread. There's no useful work to be
+ * done, so just try to conserve power and have a
+ * low exit latency (ie sit in a loop waiting for
+ * somebody to say that they'd like to reschedule)
+ */
+void cpu_idle(void)
+{
+ current_thread_info()->status |= TS_POLLING;
+ /* endless idle loop with no priority at all */
+ while (1) {
+ tick_nohz_stop_sched_tick();
+ while (!need_resched()) {
+ void (*idle)(void);
+
+ rmb();
+ idle = xen_idle; /* no alternatives */
+ if (cpu_is_offline(smp_processor_id()))
+ play_dead();
+ /*
+ * Idle routines should keep interrupts disabled
+ * from here on, until they go to idle.
+ * Otherwise, idle callbacks can misfire.
+ */
+ local_irq_disable();
+ enter_idle();
+ idle();
+ /* In many cases the interrupt that ended idle
+ has already called exit_idle. But some idle
+ loops can be woken up without interrupt. */
+ __exit_idle();
+ }
+
+ tick_nohz_restart_sched_tick();
+ preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+ }
+}
+
+/* Prints also some state that isn't saved in the pt_regs */
+void __show_regs(struct pt_regs * regs)
+{
+ unsigned long fs, gs, shadowgs;
+ unsigned long d0, d1, d2, d3, d6, d7;
+ unsigned int fsindex, gsindex;
+ unsigned int ds, cs, es;
+
+ printk("\n");
+ print_modules();
+ printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ current->pid, current->comm, print_tainted(),
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
+ printk_address(regs->ip, 1);
+ printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
+ regs->flags);
+ printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
+ regs->ax, regs->bx, regs->cx);
+ printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
+ regs->dx, regs->si, regs->di);
+ printk("RBP: %016lx R08: %016lx R09: %016lx\n",
+ regs->bp, regs->r8, regs->r9);
+ printk("R10: %016lx R11: %016lx R12: %016lx\n",
+ regs->r10, regs->r11, regs->r12);
+ printk("R13: %016lx R14: %016lx R15: %016lx\n",
+ regs->r13, regs->r14, regs->r15);
+
+ asm("mov %%ds,%0" : "=r" (ds));
+ asm("mov %%cs,%0" : "=r" (cs));
+ asm("mov %%es,%0" : "=r" (es));
+ asm("mov %%fs,%0" : "=r" (fsindex));
+ asm("mov %%gs,%0" : "=r" (gsindex));
+
+ rdmsrl(MSR_FS_BASE, fs);
+ rdmsrl(MSR_GS_BASE, gs);
+ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+
+ printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+ fs,fsindex,gs,gsindex,shadowgs);
+ printk("CS: %04x DS: %04x ES: %04x\n", cs, ds, es);
+
+ get_debugreg(d0, 0);
+ get_debugreg(d1, 1);
+ get_debugreg(d2, 2);
+ printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
+ get_debugreg(d3, 3);
+ get_debugreg(d6, 6);
+ get_debugreg(d7, 7);
+ printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+}
+
+void show_regs(struct pt_regs *regs)
+{
+ printk("CPU %d:", smp_processor_id());
+ __show_regs(regs);
+ show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
+}
+
+/*
+ * Free current thread data structures etc..
+ */
+void exit_thread(void)
+{
+ struct task_struct *me = current;
+ struct thread_struct *t = &me->thread;
+
+ if (me->thread.io_bitmap_ptr) {
+#ifndef CONFIG_X86_NO_TSS
+ struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+#endif
+#ifdef CONFIG_XEN
+ struct physdev_set_iobitmap iobmp_op;
+ memset(&iobmp_op, 0, sizeof(iobmp_op));
+#endif
+
+ kfree(t->io_bitmap_ptr);
+ t->io_bitmap_ptr = NULL;
+ clear_thread_flag(TIF_IO_BITMAP);
+ /*
+ * Careful, clear this in the TSS too:
+ */
+#ifndef CONFIG_X86_NO_TSS
+ memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
+ put_cpu();
+#endif
+#ifdef CONFIG_XEN
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
+ &iobmp_op));
+#endif
+ t->io_bitmap_max = 0;
+ }
+}
+
+void load_gs_index(unsigned gs)
+{
+ WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
+}
+
+void flush_thread(void)
+{
+ struct task_struct *tsk = current;
+
+ if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
+ clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
+ if (test_tsk_thread_flag(tsk, TIF_IA32)) {
+ clear_tsk_thread_flag(tsk, TIF_IA32);
+ } else {
+ set_tsk_thread_flag(tsk, TIF_IA32);
+ current_thread_info()->status |= TS_COMPAT;
+ }
+ }
+ clear_tsk_thread_flag(tsk, TIF_DEBUG);
+
+ tsk->thread.debugreg0 = 0;
+ tsk->thread.debugreg1 = 0;
+ tsk->thread.debugreg2 = 0;
+ tsk->thread.debugreg3 = 0;
+ tsk->thread.debugreg6 = 0;
+ tsk->thread.debugreg7 = 0;
+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ /*
+ * Forget coprocessor state..
+ */
+ tsk->fpu_counter = 0;
+ clear_fpu(tsk);
+ clear_used_math();
+}
+
+void release_thread(struct task_struct *dead_task)
+{
+ if (dead_task->mm) {
+ if (dead_task->mm->context.size) {
+ printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
+ dead_task->comm,
+ dead_task->mm->context.ldt,
+ dead_task->mm->context.size);
+ BUG();
+ }
+ }
+}
+
+static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
+{
+ struct user_desc ud = {
+ .base_addr = addr,
+ .limit = 0xfffff,
+ .seg_32bit = 1,
+ .limit_in_pages = 1,
+ .useable = 1,
+ };
+ struct desc_struct *desc = t->thread.tls_array;
+ desc += tls;
+ fill_ldt(desc, &ud);
+}
+
+static inline u32 read_32bit_tls(struct task_struct *t, int tls)
+{
+ return get_desc_base(&t->thread.tls_array[tls]);
+}
+
+/*
+ * This gets called before we allocate a new thread and copy
+ * the current task into it.
+ */
+void prepare_to_copy(struct task_struct *tsk)
+{
+ unlazy_fpu(tsk);
+}
+
+int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+ unsigned long unused,
+ struct task_struct * p, struct pt_regs * regs)
+{
+ int err;
+ struct pt_regs * childregs;
+ struct task_struct *me = current;
+
+ childregs = ((struct pt_regs *)
+ (THREAD_SIZE + task_stack_page(p))) - 1;
+ *childregs = *regs;
+
+ childregs->ax = 0;
+ childregs->sp = sp;
+ if (sp == ~0UL)
+ childregs->sp = (unsigned long)childregs;
+
+ p->thread.sp = (unsigned long) childregs;
+ p->thread.sp0 = (unsigned long) (childregs+1);
+ p->thread.usersp = me->thread.usersp;
+
+ set_tsk_thread_flag(p, TIF_FORK);
+
+ p->thread.fs = me->thread.fs;
+ p->thread.gs = me->thread.gs;
+
+ asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
+ asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
+ asm("mov %%es,%0" : "=m" (p->thread.es));
+ asm("mov %%ds,%0" : "=m" (p->thread.ds));
+
+ if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
+ p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+ if (!p->thread.io_bitmap_ptr) {
+ p->thread.io_bitmap_max = 0;
+ return -ENOMEM;
+ }
+ memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES);
+ set_tsk_thread_flag(p, TIF_IO_BITMAP);
+ }
+
+ /*
+ * Set a new TLS for the child thread?
+ */
+ if (clone_flags & CLONE_SETTLS) {
+#ifdef CONFIG_IA32_EMULATION
+ if (test_thread_flag(TIF_IA32))
+ err = do_set_thread_area(p, -1,
+ (struct user_desc __user *)childregs->si, 0);
+ else
+#endif
+ err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+ if (err)
+ goto out;
+ }
+ p->thread.iopl = current->thread.iopl;
+
+ err = 0;
+out:
+ if (err && p->thread.io_bitmap_ptr) {
+ kfree(p->thread.io_bitmap_ptr);
+ p->thread.io_bitmap_max = 0;
+ }
+ return err;
+}
+
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+ asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
+ load_gs_index(0);
+ regs->ip = new_ip;
+ regs->sp = new_sp;
+ write_pda(oldrsp, new_sp);
+ regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
+ regs->flags = 0x200;
+ set_fs(USER_DS);
+ /*
+ * Free the old FP and other extended state
+ */
+ free_thread_xstate(current);
+}
+EXPORT_SYMBOL_GPL(start_thread);
+
+static void hard_disable_TSC(void)
+{
+ write_cr4(read_cr4() | X86_CR4_TSD);
+}
+
+void disable_TSC(void)
+{
+#ifdef CONFIG_SECCOMP_DISABLE_TSC
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_disable_TSC();
+ preempt_enable();
+#endif
+}
+
+static void hard_enable_TSC(void)
+{
+ write_cr4(read_cr4() & ~X86_CR4_TSD);
+}
+
+static void enable_TSC(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_enable_TSC();
+ preempt_enable();
+}
+
+int get_tsc_mode(unsigned long adr)
+{
+ unsigned int val;
+
+ if (test_thread_flag(TIF_NOTSC))
+ val = PR_TSC_SIGSEGV;
+ else
+ val = PR_TSC_ENABLE;
+
+ return put_user(val, (unsigned int __user *)adr);
+}
+
+int set_tsc_mode(unsigned int val)
+{
+ if (val == PR_TSC_SIGSEGV)
+ disable_TSC();
+ else if (val == PR_TSC_ENABLE)
+ enable_TSC();
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * This special macro can be used to load a debugging register
+ */
+#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
+
+static inline void __switch_to_xtra(struct task_struct *prev_p,
+ struct task_struct *next_p)
+{
+ struct thread_struct *prev, *next;
+ unsigned long debugctl;
+
+ prev = &prev_p->thread,
+ next = &next_p->thread;
+
+ debugctl = prev->debugctlmsr;
+ if (next->ds_area_msr != prev->ds_area_msr) {
+ /* we clear debugctl to make sure DS
+ * is not in use when we change it */
+ debugctl = 0;
+ update_debugctlmsr(0);
+ wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
+ }
+
+ if (next->debugctlmsr != debugctl)
+ update_debugctlmsr(next->debugctlmsr);
+
+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+ loaddebug(next, 0);
+ loaddebug(next, 1);
+ loaddebug(next, 2);
+ loaddebug(next, 3);
+ /* no 4 and 5 */
+ loaddebug(next, 6);
+ loaddebug(next, 7);
+ }
+
+ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+ test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+ /* prev and next are different */
+ if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+ hard_disable_TSC();
+ else
+ hard_enable_TSC();
+ }
+
+#ifdef X86_BTS
+ if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
+ ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
+
+ if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
+ ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
+#endif
+}
+
+/*
+ * switch_to(x,y) should switch tasks from x to y.
+ *
+ * This could still be optimized:
+ * - fold all the options into a flag word and test it with a single test.
+ * - could test fs/gs bitsliced
+ *
+ * Kprobes not supported here. Set the probe on schedule instead.
+ */
+struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+ struct thread_struct *prev = &prev_p->thread,
+ *next = &next_p->thread;
+ int cpu = smp_processor_id();
+#ifndef CONFIG_X86_NO_TSS
+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
+#endif
+ struct physdev_set_iopl iopl_op;
+ struct physdev_set_iobitmap iobmp_op;
+ multicall_entry_t _mcl[8], *mcl = _mcl;
+
+ /* we're going to use this soon, after a few expensive things */
+ if (next_p->fpu_counter>5)
+ prefetch(next->xstate);
+
+ /*
+ * This is basically '__unlazy_fpu', except that we queue a
+ * multicall to indicate FPU task switch, rather than
+ * synchronously trapping to Xen.
+ * The AMD workaround requires it to be after DS reload, or
+ * after DS has been cleared, which we do in __prepare_arch_switch.
+ */
+ if (task_thread_info(prev_p)->status & TS_USEDFPU) {
+ __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
+ mcl->op = __HYPERVISOR_fpu_taskswitch;
+ mcl->args[0] = 1;
+ mcl++;
+ } else
+ prev_p->fpu_counter = 0;
+
+ /*
+ * Reload sp0.
+ * This is load_sp0(tss, next) with a multicall.
+ */
+ mcl->op = __HYPERVISOR_stack_switch;
+ mcl->args[0] = __KERNEL_DS;
+ mcl->args[1] = next->sp0;
+ mcl++;
+
+ /*
+ * Load the per-thread Thread-Local Storage descriptor.
+ * This is load_TLS(next, cpu) with multicalls.
+ */
+#define C(i) do { \
+ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
+ next->tls_array[i].b != prev->tls_array[i].b)) { \
+ mcl->op = __HYPERVISOR_update_descriptor; \
+ mcl->args[0] = virt_to_machine( \
+ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
+ mcl->args[1] = *(u64 *)&next->tls_array[i]; \
+ mcl++; \
+ } \
+} while (0)
+ C(0); C(1); C(2);
+#undef C
+
+ if (unlikely(prev->iopl != next->iopl)) {
+ iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
+ mcl->op = __HYPERVISOR_physdev_op;
+ mcl->args[0] = PHYSDEVOP_set_iopl;
+ mcl->args[1] = (unsigned long)&iopl_op;
+ mcl++;
+ }
+
+ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
+ set_xen_guest_handle(iobmp_op.bitmap,
+ (char *)next->io_bitmap_ptr);
+ iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
+ mcl->op = __HYPERVISOR_physdev_op;
+ mcl->args[0] = PHYSDEVOP_set_iobitmap;
+ mcl->args[1] = (unsigned long)&iobmp_op;
+ mcl++;
+ }
+
+ BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
+ if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
+ BUG();
+
+ /*
+ * Switch DS and ES.
+ * This won't pick up thread selector changes, but I guess that is ok.
+ */
+ if (unlikely(next->es))
+ loadsegment(es, next->es);
+
+ if (unlikely(next->ds))
+ loadsegment(ds, next->ds);
+
+ /*
+ * Switch FS and GS.
+ */
+ if (unlikely(next->fsindex))
+ loadsegment(fs, next->fsindex);
+
+ if (next->fs)
+ WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs));
+
+ if (unlikely(next->gsindex))
+ load_gs_index(next->gsindex);
+
+ if (next->gs)
+ WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs));
+
+ /*
+ * Switch the PDA context.
+ */
+ prev->usersp = read_pda(oldrsp);
+ write_pda(oldrsp, next->usersp);
+ write_pda(pcurrent, next_p);
+ write_pda(kernelstack,
+ (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+#ifdef CONFIG_CC_STACKPROTECTOR
+ write_pda(stack_canary, next_p->stack_canary);
+
+ /*
+ * Build time only check to make sure the stack_canary is at
+ * offset 40 in the pda; this is a gcc ABI requirement
+ */
+ BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
+#endif
+
+ /*
+ * Now maybe reload the debug registers
+ */
+ if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
+ task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
+ __switch_to_xtra(prev_p, next_p);
+
+ /* If the task has used fpu the last 5 timeslices, just do a full
+ * restore of the math state immediately to avoid the trap; the
+ * chances of needing FPU soon are obviously high now
+ *
+ * tsk_used_math() checks prevent calling math_state_restore(),
+ * which can sleep in the case of !tsk_used_math()
+ */
+ if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
+ math_state_restore();
+ return prev_p;
+}
+
+/*
+ * sys_execve() executes a new program.
+ */
+asmlinkage
+long sys_execve(char __user *name, char __user * __user *argv,
+ char __user * __user *envp, struct pt_regs *regs)
+{
+ long error;
+ char * filename;
+
+ filename = getname(name);
+ error = PTR_ERR(filename);
+ if (IS_ERR(filename))
+ return error;
+ error = do_execve(filename, argv, envp, regs);
+ putname(filename);
+ return error;
+}
+
+void set_personality_64bit(void)
+{
+ /* inherit personality from parent */
+
+ /* Make sure to be in 64bit mode */
+ clear_thread_flag(TIF_IA32);
+
+ /* TBD: overwrites user setup. Should have two bits.
+ But 64bit processes have always behaved this way,
+ so it's not too bad. The main problem is just that
+ 32bit childs are affected again. */
+ current->personality &= ~READ_IMPLIES_EXEC;
+}
+
+asmlinkage long sys_fork(struct pt_regs *regs)
+{
+ return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
+}
+
+asmlinkage long
+sys_clone(unsigned long clone_flags, unsigned long newsp,
+ void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+{
+ if (!newsp)
+ newsp = regs->sp;
+ return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
+}
+
+/*
+ * This is trivial, and on the face of it looks like it
+ * could equally well be done in user mode.
+ *
+ * Not so, for quite unobvious reasons - register pressure.
+ * In user mode vfork() cannot have a stack frame, and if
+ * done by calling the "clone()" system call directly, you
+ * do not have enough call-clobbered registers to hold all
+ * the information you need.
+ */
+asmlinkage long sys_vfork(struct pt_regs *regs)
+{
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
+ NULL, NULL);
+}
+
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long stack;
+ u64 fp,ip;
+ int count = 0;
+
+ if (!p || p == current || p->state==TASK_RUNNING)
+ return 0;
+ stack = (unsigned long)task_stack_page(p);
+ if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
+ return 0;
+ fp = *(u64 *)(p->thread.sp);
+ do {
+ if (fp < (unsigned long)stack ||
+ fp > (unsigned long)stack+THREAD_SIZE)
+ return 0;
+ ip = *(u64 *)(fp+8);
+ if (!in_sched_functions(ip))
+ return ip;
+ fp = *(u64 *)fp;
+ } while (count++ < 16);
+ return 0;
+}
+
+long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
+{
+ int ret = 0;
+ int doit = task == current;
+ int cpu;
+
+ switch (code) {
+ case ARCH_SET_GS:
+ if (addr >= TASK_SIZE_OF(task))
+ return -EPERM;
+ cpu = get_cpu();
+ /* handle small bases via the GDT because that's faster to
+ switch. */
+ if (addr <= 0xffffffff) {
+ set_32bit_tls(task, GS_TLS, addr);
+ if (doit) {
+ load_TLS(&task->thread, cpu);
+ load_gs_index(GS_TLS_SEL);
+ }
+ task->thread.gsindex = GS_TLS_SEL;
+ task->thread.gs = 0;
+ } else {
+ task->thread.gsindex = 0;
+ task->thread.gs = addr;
+ if (doit) {
+ load_gs_index(0);
+ ret = HYPERVISOR_set_segment_base(
+ SEGBASE_GS_USER, addr);
+ }
+ }
+ put_cpu();
+ break;
+ case ARCH_SET_FS:
+ /* Not strictly needed for fs, but do it for symmetry
+ with gs */
+ if (addr >= TASK_SIZE_OF(task))
+ return -EPERM;
+ cpu = get_cpu();
+ /* handle small bases via the GDT because that's faster to
+ switch. */
+ if (addr <= 0xffffffff) {
+ set_32bit_tls(task, FS_TLS, addr);
+ if (doit) {
+ load_TLS(&task->thread, cpu);
+ asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
+ }
+ task->thread.fsindex = FS_TLS_SEL;
+ task->thread.fs = 0;
+ } else {
+ task->thread.fsindex = 0;
+ task->thread.fs = addr;
+ if (doit) {
+ /* set the selector to 0 to not confuse
+ __switch_to */
+ asm volatile("movl %0,%%fs" :: "r" (0));
+ ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
+ addr);
+ }
+ }
+ put_cpu();
+ break;
+ case ARCH_GET_FS: {
+ unsigned long base;
+ if (task->thread.fsindex == FS_TLS_SEL)
+ base = read_32bit_tls(task, FS_TLS);
+ else if (doit)
+ rdmsrl(MSR_FS_BASE, base);
+ else
+ base = task->thread.fs;
+ ret = put_user(base, (unsigned long __user *)addr);
+ break;
+ }
+ case ARCH_GET_GS: {
+ unsigned long base;
+ unsigned gsindex;
+ if (task->thread.gsindex == GS_TLS_SEL)
+ base = read_32bit_tls(task, GS_TLS);
+ else if (doit) {
+ asm("movl %%gs,%0" : "=r" (gsindex));
+ if (gsindex)
+ rdmsrl(MSR_KERNEL_GS_BASE, base);
+ else
+ base = task->thread.gs;
+ }
+ else
+ base = task->thread.gs;
+ ret = put_user(base, (unsigned long __user *)addr);
+ break;
+ }
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+long sys_arch_prctl(int code, unsigned long addr)
+{
+ return do_arch_prctl(current, code, addr);
+}
+
+unsigned long arch_align_stack(unsigned long sp)
+{
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+ sp -= get_random_int() % 8192;
+ return sp & ~0xf;
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ unsigned long range_end = mm->brk + 0x02000000;
+ return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
--- /dev/null
+/*
+ * This file contains work-arounds for x86 and x86_64 platform bugs.
+ */
+#include <linux/pci.h>
+#include <linux/irq.h>
+
+#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
+
+static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
+{
+ u8 config, rev;
+ u16 word;
+
+ /* BIOS may enable hardware IRQ balancing for
+ * E7520/E7320/E7525(revision ID 0x9 and below)
+ * based platforms.
+ * Disable SW irqbalance/affinity on those platforms.
+ */
+ pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
+ if (rev > 0x9)
+ return;
+
+ /* enable access to config space*/
+ pci_read_config_byte(dev, 0xf4, &config);
+ pci_write_config_byte(dev, 0xf4, config|0x2);
+
+ /*
+ * read xTPR register. We may not have a pci_dev for device 8
+ * because it might be hidden until the above write.
+ */
+ pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word);
+
+ if (!(word & (1 << 13))) {
+ struct xen_platform_op op;
+
+ dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
+ "disabling irq balancing and affinity\n");
+ op.cmd = XENPF_platform_quirk;
+ op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
+ WARN_ON(HYPERVISOR_platform_op(&op));
+ }
+
+ /* put back the original value for config space*/
+ if (!(config & 0x2))
+ pci_write_config_byte(dev, 0xf4, config);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH,
+ quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH,
+ quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH,
+ quirk_intel_irqbalance);
+#endif
+
+#if defined(CONFIG_HPET_TIMER)
+#include <asm/hpet.h>
+
+unsigned long force_hpet_address;
+
+static enum {
+ NONE_FORCE_HPET_RESUME,
+ OLD_ICH_FORCE_HPET_RESUME,
+ ICH_FORCE_HPET_RESUME,
+ VT8237_FORCE_HPET_RESUME,
+ NVIDIA_FORCE_HPET_RESUME,
+} force_hpet_resume_type;
+
+static void __iomem *rcba_base;
+
+static void ich_force_hpet_resume(void)
+{
+ u32 val;
+
+ if (!force_hpet_address)
+ return;
+
+ if (rcba_base == NULL)
+ BUG();
+
+ /* read the Function Disable register, dword mode only */
+ val = readl(rcba_base + 0x3404);
+ if (!(val & 0x80)) {
+ /* HPET disabled in HPTC. Trying to enable */
+ writel(val | 0x80, rcba_base + 0x3404);
+ }
+
+ val = readl(rcba_base + 0x3404);
+ if (!(val & 0x80))
+ BUG();
+ else
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+
+ return;
+}
+
+static void ich_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 val;
+ u32 uninitialized_var(rcba);
+ int err = 0;
+
+ if (hpet_address || force_hpet_address)
+ return;
+
+ pci_read_config_dword(dev, 0xF0, &rcba);
+ rcba &= 0xFFFFC000;
+ if (rcba == 0) {
+ dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; "
+ "cannot force enable HPET\n");
+ return;
+ }
+
+ /* use bits 31:14, 16 kB aligned */
+ rcba_base = ioremap_nocache(rcba, 0x4000);
+ if (rcba_base == NULL) {
+ dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
+ "cannot force enable HPET\n");
+ return;
+ }
+
+ /* read the Function Disable register, dword mode only */
+ val = readl(rcba_base + 0x3404);
+
+ if (val & 0x80) {
+ /* HPET is enabled in HPTC. Just not reported by BIOS */
+ val = val & 0x3;
+ force_hpet_address = 0xFED00000 | (val << 12);
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+ iounmap(rcba_base);
+ return;
+ }
+
+ /* HPET disabled in HPTC. Trying to enable */
+ writel(val | 0x80, rcba_base + 0x3404);
+
+ val = readl(rcba_base + 0x3404);
+ if (!(val & 0x80)) {
+ err = 1;
+ } else {
+ val = val & 0x3;
+ force_hpet_address = 0xFED00000 | (val << 12);
+ }
+
+ if (err) {
+ force_hpet_address = 0;
+ iounmap(rcba_base);
+ dev_printk(KERN_DEBUG, &dev->dev,
+ "Failed to force enable HPET\n");
+ } else {
+ force_hpet_resume_type = ICH_FORCE_HPET_RESUME;
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+ }
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_1,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
+ ich_force_enable_hpet);
+
+
+static struct pci_dev *cached_dev;
+
+static void old_ich_force_hpet_resume(void)
+{
+ u32 val;
+ u32 uninitialized_var(gen_cntl);
+
+ if (!force_hpet_address || !cached_dev)
+ return;
+
+ pci_read_config_dword(cached_dev, 0xD0, &gen_cntl);
+ gen_cntl &= (~(0x7 << 15));
+ gen_cntl |= (0x4 << 15);
+
+ pci_write_config_dword(cached_dev, 0xD0, gen_cntl);
+ pci_read_config_dword(cached_dev, 0xD0, &gen_cntl);
+ val = gen_cntl >> 15;
+ val &= 0x7;
+ if (val == 0x4)
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+ else
+ BUG();
+}
+
+static void old_ich_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 val;
+ u32 uninitialized_var(gen_cntl);
+
+ if (hpet_address || force_hpet_address)
+ return;
+
+ pci_read_config_dword(dev, 0xD0, &gen_cntl);
+ /*
+ * Bit 17 is HPET enable bit.
+ * Bit 16:15 control the HPET base address.
+ */
+ val = gen_cntl >> 15;
+ val &= 0x7;
+ if (val & 0x4) {
+ val &= 0x3;
+ force_hpet_address = 0xFED00000 | (val << 12);
+ dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
+ force_hpet_address);
+ return;
+ }
+
+ /*
+ * HPET is disabled. Trying enabling at FED00000 and check
+ * whether it sticks
+ */
+ gen_cntl &= (~(0x7 << 15));
+ gen_cntl |= (0x4 << 15);
+ pci_write_config_dword(dev, 0xD0, gen_cntl);
+
+ pci_read_config_dword(dev, 0xD0, &gen_cntl);
+
+ val = gen_cntl >> 15;
+ val &= 0x7;
+ if (val & 0x4) {
+ /* HPET is enabled in HPTC. Just not reported by BIOS */
+ val &= 0x3;
+ force_hpet_address = 0xFED00000 | (val << 12);
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+ cached_dev = dev;
+ force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME;
+ return;
+ }
+
+ dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
+}
+
+/*
+ * Undocumented chipset features. Make sure that the user enforced
+ * this.
+ */
+static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
+{
+ if (hpet_force_user)
+ old_ich_force_enable_hpet(dev);
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
+ old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
+ old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0,
+ old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12,
+ old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0,
+ old_ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12,
+ old_ich_force_enable_hpet);
+
+
+static void vt8237_force_hpet_resume(void)
+{
+ u32 val;
+
+ if (!force_hpet_address || !cached_dev)
+ return;
+
+ val = 0xfed00000 | 0x80;
+ pci_write_config_dword(cached_dev, 0x68, val);
+
+ pci_read_config_dword(cached_dev, 0x68, &val);
+ if (val & 0x80)
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+ else
+ BUG();
+}
+
+static void vt8237_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 uninitialized_var(val);
+
+ if (!hpet_force_user || hpet_address || force_hpet_address)
+ return;
+
+ pci_read_config_dword(dev, 0x68, &val);
+ /*
+ * Bit 7 is HPET enable bit.
+ * Bit 31:10 is HPET base address (contrary to what datasheet claims)
+ */
+ if (val & 0x80) {
+ force_hpet_address = (val & ~0x3ff);
+ dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
+ force_hpet_address);
+ return;
+ }
+
+ /*
+ * HPET is disabled. Trying enabling at FED00000 and check
+ * whether it sticks
+ */
+ val = 0xfed00000 | 0x80;
+ pci_write_config_dword(dev, 0x68, val);
+
+ pci_read_config_dword(dev, 0x68, &val);
+ if (val & 0x80) {
+ force_hpet_address = (val & ~0x3ff);
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+ cached_dev = dev;
+ force_hpet_resume_type = VT8237_FORCE_HPET_RESUME;
+ return;
+ }
+
+ dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
+ vt8237_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
+ vt8237_force_enable_hpet);
+
+/*
+ * Undocumented chipset feature taken from LinuxBIOS.
+ */
+static void nvidia_force_hpet_resume(void)
+{
+ pci_write_config_dword(cached_dev, 0x44, 0xfed00001);
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+}
+
+static void nvidia_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 uninitialized_var(val);
+
+ if (!hpet_force_user || hpet_address || force_hpet_address)
+ return;
+
+ pci_write_config_dword(dev, 0x44, 0xfed00001);
+ pci_read_config_dword(dev, 0x44, &val);
+ force_hpet_address = val & 0xfffffffe;
+ force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME;
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
+ force_hpet_address);
+ cached_dev = dev;
+ return;
+}
+
+/* ISA Bridges */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0050,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0051,
+ nvidia_force_enable_hpet);
+
+/* LPC bridges */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0362,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0363,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0364,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0365,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0366,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367,
+ nvidia_force_enable_hpet);
+
+void force_hpet_resume(void)
+{
+ switch (force_hpet_resume_type) {
+ case ICH_FORCE_HPET_RESUME:
+ ich_force_hpet_resume();
+ return;
+ case OLD_ICH_FORCE_HPET_RESUME:
+ old_ich_force_hpet_resume();
+ return;
+ case VT8237_FORCE_HPET_RESUME:
+ vt8237_force_hpet_resume();
+ return;
+ case NVIDIA_FORCE_HPET_RESUME:
+ nvidia_force_hpet_resume();
+ return;
+ default:
+ break;
+ }
+}
+
+#endif
movl PTR(PA_PGD)(%ebp), %eax
movl %eax, %cr3
+ /* setup idt */
+ movl %edi, %eax
+ addl $(idt_48 - relocate_kernel), %eax
+ lidtl (%eax)
+
+ /* setup gdt */
+ movl %edi, %eax
+ addl $(gdt - relocate_kernel), %eax
+ movl %edi, %esi
+ addl $((gdt_48 - relocate_kernel) + 2), %esi
+ movl %eax, (%esi)
+
+ movl %edi, %eax
+ addl $(gdt_48 - relocate_kernel), %eax
+ lgdtl (%eax)
+
+ /* setup data segment registers */
+ mov $(gdt_ds - gdt), %eax
+ mov %eax, %ds
+ mov %eax, %es
+ mov %eax, %fs
+ mov %eax, %gs
+ mov %eax, %ss
+
/* setup a new stack at the end of the physical control page */
lea PAGE_SIZE(%edi), %esp
- /* jump to identity mapped page */
- movl %edi, %eax
- addl $(identity_mapped - relocate_kernel), %eax
- pushl %eax
- ret
+ /* load new code segment and jump to identity mapped page */
+ movl %edi, %esi
+ xorl %eax, %eax
+ pushl %eax
+ pushl %esi
+ pushl %eax
+ movl $(gdt_cs - gdt), %eax
+ pushl %eax
+ movl %edi, %eax
+ addl $(identity_mapped - relocate_kernel),%eax
+ pushl %eax
+ iretl
identity_mapped:
/* store the start address on the stack */
xorl %edi, %edi
xorl %ebp, %ebp
ret
+
+ .align 16
+gdt:
+ .quad 0x0000000000000000 /* NULL descriptor */
+gdt_cs:
+ .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
+gdt_ds:
+ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
+gdt_end:
+
+gdt_48:
+ .word gdt_end - gdt - 1 /* limit */
+ .long 0 /* base - filled in by code above */
+
+idt_48:
+ .word 0 /* limit */
+ .long 0 /* base */
movq PTR(PA_PGD)(%rsi), %r9
movq %r9, %cr3
+ /* setup idt */
+ movq %r8, %rax
+ addq $(idt_80 - relocate_kernel), %rax
+ lidtq (%rax)
+
+ /* setup gdt */
+ movq %r8, %rax
+ addq $(gdt - relocate_kernel), %rax
+ movq %r8, %r9
+ addq $((gdt_80 - relocate_kernel) + 2), %r9
+ movq %rax, (%r9)
+
+ movq %r8, %rax
+ addq $(gdt_80 - relocate_kernel), %rax
+ lgdtq (%rax)
+
+ /* setup data segment registers */
+ xorl %eax, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %fs
+ movl %eax, %gs
+ movl %eax, %ss
+
/* setup a new stack at the end of the physical control page */
lea PAGE_SIZE(%r8), %rsp
- /* jump to identity mapped page */
- addq $(identity_mapped - relocate_kernel), %r8
- pushq %r8
- ret
+ /* load new code segment and jump to identity mapped page */
+ movq %r8, %rax
+ addq $(identity_mapped - relocate_kernel), %rax
+ pushq $(gdt_cs - gdt)
+ pushq %rax
+ lretq
identity_mapped:
/* store the start address on the stack */
xorq %r13, %r13
xorq %r14, %r14
xorq %r15, %r15
-
ret
+
+ .align 16
+gdt:
+ .quad 0x0000000000000000 /* NULL descriptor */
+gdt_cs:
+ .quad 0x00af9a000000ffff
+gdt_end:
+
+gdt_80:
+ .word gdt_end - gdt - 1 /* limit */
+ .quad 0 /* base - filled in by code above */
+
+idt_80:
+ .word 0 /* limit */
+ .quad 0 /* base */
{
unsigned long retval, flags;
+#ifdef CONFIG_XEN
+ if (!is_initial_xendomain())
+ return xen_read_persistent_clock();
+#endif
spin_lock_irqsave(&rtc_lock, flags);
retval = get_wallclock();
spin_unlock_irqrestore(&rtc_lock, flags);
int update_persistent_clock(struct timespec now)
{
+#ifdef CONFIG_XEN
+ if (xen_update_persistent_clock() < 0 || xen_independent_wallclock())
+ return 0;
+#endif
return set_rtc_mmss(now.tv_sec);
}
--- /dev/null
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/percpu.h>
+#include <asm/smp.h>
+#include <asm/percpu.h>
+#include <asm/sections.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/topology.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+unsigned int num_processors;
+unsigned disabled_cpus __cpuinitdata;
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
+EXPORT_SYMBOL(boot_cpu_physical_apicid);
+
+DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
+EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+
+/* Bitmask of physically existing CPUs */
+physid_mask_t phys_cpu_present_map;
+#endif
+
+#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
+/*
+ * Copy data used in early init routines from the initial arrays to the
+ * per cpu data areas. These arrays then become expendable and the
+ * *_early_ptr's are zeroed indicating that the static arrays are gone.
+ */
+static void __init setup_per_cpu_maps(void)
+{
+#ifndef CONFIG_XEN
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
+ per_cpu(x86_bios_cpu_apicid, cpu) =
+ x86_bios_cpu_apicid_init[cpu];
+#ifdef CONFIG_NUMA
+ per_cpu(x86_cpu_to_node_map, cpu) =
+ x86_cpu_to_node_map_init[cpu];
+#endif
+ }
+
+ /* indicate the early static arrays will soon be gone */
+ x86_cpu_to_apicid_early_ptr = NULL;
+ x86_bios_cpu_apicid_early_ptr = NULL;
+#ifdef CONFIG_NUMA
+ x86_cpu_to_node_map_early_ptr = NULL;
+#endif
+#endif
+}
+
+#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
+cpumask_t *cpumask_of_cpu_map __read_mostly;
+EXPORT_SYMBOL(cpumask_of_cpu_map);
+
+/* requires nr_cpu_ids to be initialized */
+static void __init setup_cpumask_of_cpu(void)
+{
+ int i;
+
+ /* alloc_bootmem zeroes memory */
+ cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
+ for (i = 0; i < nr_cpu_ids; i++)
+ cpu_set(i, cpumask_of_cpu_map[i]);
+}
+#else
+static inline void setup_cpumask_of_cpu(void) { }
+#endif
+
+#ifdef CONFIG_X86_32
+/*
+ * Great future not-so-futuristic plan: make i386 and x86_64 do it
+ * the same way
+ */
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+#endif
+
+/*
+ * Great future plan:
+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
+ * Always point %gs to its beginning
+ */
+void __init setup_per_cpu_areas(void)
+{
+ int i, highest_cpu = 0;
+ unsigned long size;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ prefill_possible_map();
+#endif
+
+ /* Copy section for each CPU (we discard the original) */
+ size = PERCPU_ENOUGH_ROOM;
+ printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
+ size);
+
+ for_each_possible_cpu(i) {
+ char *ptr;
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ ptr = alloc_bootmem_pages(size);
+#else
+ int node = early_cpu_to_node(i);
+ if (!node_online(node) || !NODE_DATA(node)) {
+ ptr = alloc_bootmem_pages(size);
+ printk(KERN_INFO
+ "cpu %d has no node or node-local memory\n", i);
+ }
+ else
+ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+#endif
+ if (!ptr)
+ panic("Cannot allocate cpu data for CPU %d\n", i);
+#ifdef CONFIG_X86_64
+ cpu_pda(i)->data_offset = ptr - __per_cpu_start;
+#else
+ __per_cpu_offset[i] = ptr - __per_cpu_start;
+#endif
+ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+
+ highest_cpu = i;
+ }
+
+ nr_cpu_ids = highest_cpu + 1;
+ printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
+
+ /* Setup percpu data maps */
+ setup_per_cpu_maps();
+
+ /* Setup cpumask_of_cpu map */
+ setup_cpumask_of_cpu();
+}
+
+#endif
--- /dev/null
+/*
+ * X86-64 specific CPU setup.
+ * Copyright (C) 1995 Linus Torvalds
+ * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
+ * See setup.c for older changelog.
+ *
+ * Jun Nakajima <jun.nakajima@intel.com>
+ * Modified for Xen
+ *
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/kgdb.h>
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/mmu_context.h>
+#include <asm/smp.h>
+#include <asm/i387.h>
+#include <asm/percpu.h>
+#include <asm/proto.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/genapic.h>
+#ifdef CONFIG_XEN
+#include <asm/hypervisor.h>
+#endif
+
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+struct boot_params __initdata boot_params;
+#else
+struct boot_params boot_params;
+#endif
+
+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
+
+struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
+struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
+
+#ifndef CONFIG_X86_NO_IDT
+struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
+#endif
+
+char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
+
+unsigned long __supported_pte_mask __read_mostly = ~0UL;
+EXPORT_SYMBOL(__supported_pte_mask);
+
+static int do_not_nx __cpuinitdata = 0;
+
+/* noexec=on|off
+Control non executable mappings for 64bit processes.
+
+on Enable(default)
+off Disable
+*/
+static int __init nonx_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (!strncmp(str, "on", 2)) {
+ __supported_pte_mask |= _PAGE_NX;
+ do_not_nx = 0;
+ } else if (!strncmp(str, "off", 3)) {
+ do_not_nx = 1;
+ __supported_pte_mask &= ~_PAGE_NX;
+ }
+ return 0;
+}
+early_param("noexec", nonx_setup);
+
+int force_personality32 = 0;
+
+/* noexec32=on|off
+Control non executable heap for 32bit processes.
+To control the stack too use noexec=off
+
+on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
+off PROT_READ implies PROT_EXEC
+*/
+static int __init nonx32_setup(char *str)
+{
+ if (!strcmp(str, "on"))
+ force_personality32 &= ~READ_IMPLIES_EXEC;
+ else if (!strcmp(str, "off"))
+ force_personality32 |= READ_IMPLIES_EXEC;
+ return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
+#ifdef CONFIG_XEN
+static void __init_refok switch_pt(int cpu)
+{
+ if (cpu == 0)
+ xen_init_pt();
+ xen_pt_switch(__pa_symbol(init_level4_pgt));
+ xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
+}
+#define switch_pt() switch_pt(cpu)
+
+static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
+{
+ unsigned long frames[16];
+ unsigned long va;
+ int f;
+
+ for (va = gdt_descr->address, f = 0;
+ va < gdt_descr->address + gdt_descr->size;
+ va += PAGE_SIZE, f++) {
+ frames[f] = virt_to_mfn(va);
+ make_page_readonly(
+ (void *)va, XENFEAT_writable_descriptor_tables);
+ }
+ if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
+ sizeof (struct desc_struct)))
+ BUG();
+}
+#else
+static void switch_pt(void)
+{
+ asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
+}
+
+static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
+{
+ load_gdt(gdt_descr);
+ load_idt(idt_descr);
+}
+#endif
+
+void pda_init(int cpu)
+{
+ struct x8664_pda *pda = cpu_pda(cpu);
+
+ /* Setup up data that may be needed in __get_free_pages early */
+ asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
+#ifndef CONFIG_XEN
+ /* Memory clobbers used to order PDA accessed */
+ mb();
+ wrmsrl(MSR_GS_BASE, pda);
+ mb();
+#else
+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
+ (unsigned long)pda))
+ BUG();
+#endif
+ pda->cpunumber = cpu;
+ pda->irqcount = -1;
+ pda->kernelstack =
+ (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
+ pda->active_mm = &init_mm;
+ pda->mmu_state = 0;
+
+ if (cpu == 0) {
+ /* others are initialized in smpboot.c */
+ pda->pcurrent = &init_task;
+ pda->irqstackptr = boot_cpu_stack;
+ } else {
+ pda->irqstackptr = (char *)
+ __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
+ if (!pda->irqstackptr)
+ panic("cannot allocate irqstack for cpu %d", cpu);
+ }
+
+ switch_pt();
+
+ pda->irqstackptr += IRQSTACKSIZE-64;
+}
+
+#ifndef CONFIG_X86_NO_TSS
+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
+__attribute__((section(".bss.page_aligned")));
+#endif
+
+extern asmlinkage void ignore_sysret(void);
+
+/* May not be marked __init: used by software suspend */
+void syscall_init(void)
+{
+#ifndef CONFIG_XEN
+ /*
+ * LSTAR and STAR live in a bit strange symbiosis.
+ * They both write to the same internal register. STAR allows to set CS/DS
+ * but only a 32bit target. LSTAR sets the 64bit rip.
+ */
+ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
+ wrmsrl(MSR_LSTAR, system_call);
+ wrmsrl(MSR_CSTAR, ignore_sysret);
+
+ /* Flags to clear on syscall */
+ wrmsrl(MSR_SYSCALL_MASK,
+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+#endif
+#ifdef CONFIG_IA32_EMULATION
+ syscall32_cpu_init ();
+#else
+ {
+ static const struct callback_register cstar = {
+ .type = CALLBACKTYPE_syscall32,
+ .address = (unsigned long)ignore_sysret
+ };
+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
+ printk(KERN_WARN "Unable to register CSTAR callback\n");
+ }
+#endif
+}
+
+void __cpuinit check_efer(void)
+{
+ unsigned long efer;
+
+ rdmsrl(MSR_EFER, efer);
+ if (!(efer & EFER_NX) || do_not_nx) {
+ __supported_pte_mask &= ~_PAGE_NX;
+ }
+}
+
+unsigned long kernel_eflags;
+
+#ifndef CONFIG_X86_NO_TSS
+/*
+ * Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+#endif
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init.
+ */
+void __cpuinit cpu_init (void)
+{
+ int cpu = stack_smp_processor_id();
+#ifndef CONFIG_X86_NO_TSS
+ struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
+ unsigned long v;
+ char *estacks = NULL;
+ unsigned i;
+#endif
+ struct task_struct *me;
+
+ /* CPU 0 is initialised in head64.c */
+ if (cpu != 0) {
+ pda_init(cpu);
+ }
+#ifndef CONFIG_X86_NO_TSS
+ else
+ estacks = boot_exception_stacks;
+#endif
+
+ me = current;
+
+ if (cpu_test_and_set(cpu, cpu_initialized))
+ panic("CPU#%d already initialized!\n", cpu);
+
+ printk("Initializing CPU#%d\n", cpu);
+
+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+ /*
+ * Initialize the per-CPU GDT with the boot GDT,
+ * and set up the GDT descriptor:
+ */
+#ifndef CONFIG_XEN
+ if (cpu)
+ memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
+#endif
+
+ cpu_gdt_descr[cpu].size = GDT_SIZE;
+ cpu_gdt_init(&cpu_gdt_descr[cpu]);
+
+ memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
+
+ wrmsrl(MSR_FS_BASE, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
+ barrier();
+
+ check_efer();
+
+#ifndef CONFIG_X86_NO_TSS
+ /*
+ * set up and load the per-CPU TSS
+ */
+ for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+ static const unsigned int order[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+ [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+ };
+ if (cpu) {
+ estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
+ if (!estacks)
+ panic("Cannot allocate exception stack %ld %d\n",
+ v, cpu);
+ }
+ estacks += PAGE_SIZE << order[v];
+ orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
+ }
+
+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ /*
+ * <= is required because the CPU will access up to
+ * 8 bits beyond the end of the IO permission bitmap.
+ */
+ for (i = 0; i <= IO_BITMAP_LONGS; i++)
+ t->io_bitmap[i] = ~0UL;
+#endif
+
+ atomic_inc(&init_mm.mm_count);
+ me->active_mm = &init_mm;
+ if (me->mm)
+ BUG();
+ enter_lazy_tlb(&init_mm, me);
+
+#ifndef CONFIG_X86_NO_TSS
+ set_tss_desc(cpu, t);
+#endif
+#ifndef CONFIG_XEN
+ load_TR_desc();
+#endif
+ load_LDT(&init_mm.context);
+
+#ifdef CONFIG_KGDB
+ /*
+ * If the kgdb is connected no debug regs should be altered. This
+ * is only applicable when KGDB and a KGDB I/O module are built
+ * into the kernel and you are using early debugging with
+ * kgdbwait. KGDB will control the kernel HW breakpoint registers.
+ */
+ if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ else {
+#endif
+ /*
+ * Clear all 6 debug registers:
+ */
+
+ set_debugreg(0UL, 0);
+ set_debugreg(0UL, 1);
+ set_debugreg(0UL, 2);
+ set_debugreg(0UL, 3);
+ set_debugreg(0UL, 6);
+ set_debugreg(0UL, 7);
+#ifdef CONFIG_KGDB
+ /* If the kgdb is connected no debug regs should be altered. */
+ }
+#endif
+
+ fpu_init();
+
+ raw_local_save_flags(kernel_eflags);
+
+ if (is_uv_system())
+ uv_cpu_init();
+}
--- /dev/null
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *
+ * Memory region support
+ * David Parsons <orc@pell.chi.il.us>, July-August 1999
+ *
+ * Added E820 sanitization routine (removes overlapping memory regions);
+ * Brian Moyle <bmoyle@mvista.com>, February 2001
+ *
+ * Moved CPU detection code to cpu/${cpu}.c
+ * Patrick Mochel <mochel@osdl.org>, March 2002
+ *
+ * Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach <xela@slit.de>, December 2002.
+ *
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/screen_info.h>
+#include <linux/ioport.h>
+#include <linux/acpi.h>
+#include <linux/apm_bios.h>
+#include <linux/initrd.h>
+#include <linux/bootmem.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/mca.h>
+#include <linux/root_dev.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
+#include <linux/nodemask.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
+#include <linux/dmi.h>
+#include <linux/pfn.h>
+#include <linux/pci.h>
+#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
+
+#include <video/edid.h>
+
+#include <asm/mtrr.h>
+#include <asm/apic.h>
+#include <asm/e820.h>
+#include <asm/mpspec.h>
+#include <asm/mmzone.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/sections.h>
+#include <asm/io_apic.h>
+#include <asm/ist.h>
+#include <asm/io.h>
+#include <asm/hypervisor.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/memory.h>
+#include <xen/features.h>
+#include <xen/firmware.h>
+#include <xen/xencons.h>
+#include <setup_arch.h>
+#include <asm/bios_ebda.h>
+#include <asm/cacheflush.h>
+#include <asm/processor.h>
+
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
+static int xen_panic_event(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xen_panic_block = {
+ xen_panic_event, NULL, 0 /* try to go last */
+};
+
+extern char hypercall_page[PAGE_SIZE];
+EXPORT_SYMBOL(hypercall_page);
+
+/*
+ * Machine setup..
+ */
+static struct resource data_resource = {
+ .name = "Kernel data",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource code_resource = {
+ .name = "Kernel code",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource bss_resource = {
+ .name = "Kernel bss",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource video_ram_resource = {
+ .name = "Video RAM area",
+ .start = 0xa0000,
+ .end = 0xbffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource standard_io_resources[] = { {
+ .name = "dma1",
+ .start = 0x0000,
+ .end = 0x001f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "pic1",
+ .start = 0x0020,
+ .end = 0x0021,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "timer0",
+ .start = 0x0040,
+ .end = 0x0043,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "timer1",
+ .start = 0x0050,
+ .end = 0x0053,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "keyboard",
+ .start = 0x0060,
+ .end = 0x0060,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "keyboard",
+ .start = 0x0064,
+ .end = 0x0064,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "dma page reg",
+ .start = 0x0080,
+ .end = 0x008f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "pic2",
+ .start = 0x00a0,
+ .end = 0x00a1,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "dma2",
+ .start = 0x00c0,
+ .end = 0x00df,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "fpu",
+ .start = 0x00f0,
+ .end = 0x00ff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+} };
+
+/* cpu data as detected by the assembly code in head.S */
+struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+/* common cpu data for all cpus */
+struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+EXPORT_SYMBOL(boot_cpu_data);
+
+unsigned int def_to_bigsmp;
+
+#ifndef CONFIG_X86_PAE
+unsigned long mmu_cr4_features;
+#else
+unsigned long mmu_cr4_features = X86_CR4_PAE;
+#endif
+
+/* for MCA, but anyone else can use it if they want */
+unsigned int machine_id;
+unsigned int machine_submodel_id;
+unsigned int BIOS_revision;
+
+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
+int bootloader_type;
+
+/* user-defined highmem size */
+static unsigned int highmem_pages = -1;
+
+/*
+ * Setup options
+ */
+struct screen_info screen_info;
+EXPORT_SYMBOL(screen_info);
+struct apm_info apm_info;
+EXPORT_SYMBOL(apm_info);
+struct edid_info edid_info;
+EXPORT_SYMBOL_GPL(edid_info);
+#ifndef CONFIG_XEN
+#define copy_edid() (edid_info = boot_params.edid_info)
+#endif
+struct ist_info ist_info;
+#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
+ defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
+EXPORT_SYMBOL(ist_info);
+#endif
+
+extern void early_cpu_init(void);
+extern int root_mountflags;
+
+unsigned long saved_video_mode;
+
+#define RAMDISK_IMAGE_START_MASK 0x07FF
+#define RAMDISK_PROMPT_FLAG 0x8000
+#define RAMDISK_LOAD_FLAG 0x4000
+
+static char __initdata command_line[COMMAND_LINE_SIZE];
+
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+struct boot_params __initdata boot_params;
+#else
+struct boot_params boot_params;
+#endif
+
+/*
+ * Point at the empty zero page to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
+EXPORT_SYMBOL(HYPERVISOR_shared_info);
+
+unsigned long *phys_to_machine_mapping;
+unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+/* Raw start-of-day parameters from the hypervisor. */
+start_info_t *xen_start_info;
+EXPORT_SYMBOL(xen_start_info);
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+struct edd edd;
+#ifdef CONFIG_EDD_MODULE
+EXPORT_SYMBOL(edd);
+#endif
+#ifndef CONFIG_XEN
+/**
+ * copy_edd() - Copy the BIOS EDD information
+ * from boot_params into a safe place.
+ *
+ */
+static inline void copy_edd(void)
+{
+ memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
+ sizeof(edd.mbr_signature));
+ memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
+ edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
+ edd.edd_info_nr = boot_params.eddbuf_entries;
+}
+#endif
+#else
+static inline void copy_edd(void)
+{
+}
+#endif
+
+int __initdata user_defined_memmap;
+
+/*
+ * "mem=nopentium" disables the 4MB page tables.
+ * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
+ * to <mem>, overriding the bios size.
+ * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
+ * <start> to <start>+<mem>, overriding the bios size.
+ *
+ * HPA tells me bootloaders need to parse mem=, so no new
+ * option should be mem= [also see Documentation/i386/boot.txt]
+ */
+static int __init parse_mem(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp(arg, "nopentium") == 0) {
+ setup_clear_cpu_cap(X86_FEATURE_PSE);
+ } else {
+ /* If the user specifies memory size, we
+ * limit the BIOS-provided memory map to
+ * that size. exactmap can be used to specify
+ * the exact map. mem=number can be used to
+ * trim the existing memory map.
+ */
+ unsigned long long mem_size;
+
+ mem_size = memparse(arg, &arg);
+ limit_regions(mem_size);
+ user_defined_memmap = 1;
+ }
+ return 0;
+}
+early_param("mem", parse_mem);
+
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel.
+ */
+static int __init parse_elfcorehdr(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ elfcorehdr_addr = memparse(arg, &arg);
+ return 0;
+}
+early_param("elfcorehdr", parse_elfcorehdr);
+#endif /* CONFIG_PROC_VMCORE */
+
+/*
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
+ */
+static int __init parse_highmem(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+ return 0;
+}
+early_param("highmem", parse_highmem);
+
+/*
+ * vmalloc=size forces the vmalloc area to be exactly 'size'
+ * bytes. This can be used to increase (or decrease) the
+ * vmalloc area - the default is 128m.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ __VMALLOC_RESERVE = memparse(arg, &arg);
+ return 0;
+}
+early_param("vmalloc", parse_vmalloc);
+
+#ifndef CONFIG_XEN
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+ unsigned long address;
+
+ if (!arg)
+ return -EINVAL;
+
+ address = memparse(arg, &arg);
+ reserve_top_address(address);
+ return 0;
+}
+early_param("reservetop", parse_reservetop);
+#endif
+
+/*
+ * Determine low and high memory ranges:
+ */
+unsigned long __init find_max_low_pfn(void)
+{
+ unsigned long max_low_pfn;
+
+ max_low_pfn = max_pfn;
+ if (max_low_pfn > MAXMEM_PFN) {
+ if (highmem_pages == -1)
+ highmem_pages = max_pfn - MAXMEM_PFN;
+ if (highmem_pages + MAXMEM_PFN < max_pfn)
+ max_pfn = MAXMEM_PFN + highmem_pages;
+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
+ printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+ max_low_pfn = MAXMEM_PFN;
+#ifndef CONFIG_HIGHMEM
+ /* Maximum memory usable is what is directly addressable */
+ printk(KERN_WARNING "Warning only %ldMB will be used.\n",
+ MAXMEM>>20);
+ if (max_pfn > MAX_NONPAE_PFN)
+ printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
+ else
+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ max_pfn = MAXMEM_PFN;
+#else /* !CONFIG_HIGHMEM */
+#ifndef CONFIG_HIGHMEM64G
+ if (max_pfn > MAX_NONPAE_PFN) {
+ max_pfn = MAX_NONPAE_PFN;
+ printk(KERN_WARNING "Warning only 4GB will be used.\n");
+ printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
+ }
+#endif /* !CONFIG_HIGHMEM64G */
+#endif /* !CONFIG_HIGHMEM */
+ } else {
+ if (highmem_pages == -1)
+ highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+ if (highmem_pages >= max_pfn) {
+ printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
+ highmem_pages = 0;
+ }
+ if (highmem_pages) {
+ if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
+ printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+ max_low_pfn -= highmem_pages;
+ }
+#else
+ if (highmem_pages)
+ printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
+#endif
+ }
+ return max_low_pfn;
+}
+
+#ifndef CONFIG_XEN
+#define BIOS_LOWMEM_KILOBYTES 0x413
+
+/*
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
+ */
+static void __init reserve_ebda_region(void)
+{
+ unsigned int lowmem, ebda_addr;
+
+ /* To determine the position of the EBDA and the */
+ /* end of conventional memory, we need to look at */
+ /* the BIOS data area. In a paravirtual environment */
+ /* that area is absent. We'll just have to assume */
+ /* that the paravirt case can handle memory setup */
+ /* correctly, without our help. */
+ if (paravirt_enabled())
+ return;
+
+ /* end of low (conventional) memory */
+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+ lowmem <<= 10;
+
+ /* start of EBDA area */
+ ebda_addr = get_bios_ebda();
+
+ /* Fixup: bios puts an EBDA in the top 64K segment */
+ /* of conventional memory, but does not adjust lowmem. */
+ if ((lowmem - ebda_addr) <= 0x10000)
+ lowmem = ebda_addr;
+
+ /* Fixup: bios does not report an EBDA at all. */
+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+ lowmem = 0x9f000;
+
+ /* Paranoia: should never happen, but... */
+ if ((lowmem == 0) || (lowmem >= 0x100000))
+ lowmem = 0x9f000;
+
+ /* reserve all memory between lowmem and the 1MB mark */
+ reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
+}
+#endif
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+static void __init setup_bootmem_allocator(void);
+static unsigned long __init setup_memory(void)
+{
+ /*
+ * partially used pages are not usable - thus
+ * we are rounding upwards:
+ */
+ min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
+ xen_start_info->nr_pt_frames;
+
+ max_low_pfn = find_max_low_pfn();
+
+#ifdef CONFIG_HIGHMEM
+ highstart_pfn = highend_pfn = max_pfn;
+ if (max_pfn > max_low_pfn) {
+ highstart_pfn = max_low_pfn;
+ }
+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+ pages_to_mb(highend_pfn - highstart_pfn));
+ num_physpages = highend_pfn;
+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+ num_physpages = max_low_pfn;
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+#ifdef CONFIG_FLATMEM
+ max_mapnr = num_physpages;
+#endif
+ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+ pages_to_mb(max_low_pfn));
+
+ setup_bootmem_allocator();
+
+ return max_low_pfn;
+}
+
+static void __init zone_sizes_init(void)
+{
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+ max_zone_pfns[ZONE_DMA] =
+ virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
+ add_active_range(0, 0, highend_pfn);
+#else
+ add_active_range(0, 0, max_low_pfn);
+#endif
+
+ free_area_init_nodes(max_zone_pfns);
+}
+#else
+extern unsigned long __init setup_memory(void);
+extern void zone_sizes_init(void);
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+
+static inline unsigned long long get_total_mem(void)
+{
+ unsigned long long total;
+
+ total = max_low_pfn - min_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ total += highend_pfn - highstart_pfn;
+#endif
+
+ return total << PAGE_SHIFT;
+}
+
+#ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
+static void __init reserve_crashkernel(void)
+{
+ unsigned long long total_mem;
+ unsigned long long crash_size, crash_base;
+ int ret;
+
+ total_mem = get_total_mem();
+
+ ret = parse_crashkernel(boot_command_line, total_mem,
+ &crash_size, &crash_base);
+ if (ret == 0 && crash_size > 0) {
+ if (crash_base > 0) {
+ printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+ "for crashkernel (System RAM: %ldMB)\n",
+ (unsigned long)(crash_size >> 20),
+ (unsigned long)(crash_base >> 20),
+ (unsigned long)(total_mem >> 20));
+
+ if (reserve_bootmem(crash_base, crash_size,
+ BOOTMEM_EXCLUSIVE) < 0) {
+ printk(KERN_INFO "crashkernel reservation "
+ "failed - memory is in use\n");
+ return;
+ }
+
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+ } else
+ printk(KERN_INFO "crashkernel reservation failed - "
+ "you have to specify a base address\n");
+ }
+}
+#else
+#define reserve_crashkernel xen_machine_kexec_setup_resources
+#endif
+#else
+static inline void __init reserve_crashkernel(void)
+{}
+#endif
+
+#ifdef CONFIG_BLK_DEV_INITRD
+
+static bool do_relocate_initrd = false;
+
+static void __init reserve_initrd(void)
+{
+ unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
+ unsigned long ramdisk_size = xen_start_info->mod_len;
+ unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
+ unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+ unsigned long ramdisk_here;
+
+ initrd_start = 0;
+
+ if (!xen_start_info->mod_start || !ramdisk_size)
+ return; /* No initrd provided by bootloader */
+
+ if (ramdisk_end < ramdisk_image) {
+ printk(KERN_ERR "initrd wraps around end of memory, "
+ "disabling initrd\n");
+ return;
+ }
+ if (ramdisk_size >= end_of_lowmem/2) {
+ printk(KERN_ERR "initrd too large to handle, "
+ "disabling initrd\n");
+ return;
+ }
+ if (ramdisk_end <= end_of_lowmem) {
+ /* All in lowmem, easy case */
+ reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
+ initrd_start = ramdisk_image + PAGE_OFFSET;
+ initrd_end = initrd_start+ramdisk_size;
+ return;
+ }
+
+ /* We need to move the initrd down into lowmem */
+ ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
+
+ /* Note: this includes all the lowmem currently occupied by
+ the initrd, we rely on that fact to keep the data intact. */
+ reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
+ initrd_start = ramdisk_here + PAGE_OFFSET;
+ initrd_end = initrd_start + ramdisk_size;
+
+ do_relocate_initrd = true;
+}
+
+#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
+
+static void __init relocate_initrd(void)
+{
+ unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
+ unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
+ unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+ unsigned long ramdisk_here;
+ unsigned long slop, clen, mapaddr;
+ char *p, *q;
+
+ if (!do_relocate_initrd)
+ return;
+
+ ramdisk_here = initrd_start - PAGE_OFFSET;
+
+ q = (char *)initrd_start;
+
+ /* Copy any lowmem portion of the initrd */
+ if (ramdisk_image < end_of_lowmem) {
+ clen = end_of_lowmem - ramdisk_image;
+ p = (char *)__va(ramdisk_image);
+ memcpy(q, p, clen);
+ q += clen;
+ ramdisk_image += clen;
+ ramdisk_size -= clen;
+ }
+
+ /* Copy the highmem portion of the initrd */
+ while (ramdisk_size) {
+ slop = ramdisk_image & ~PAGE_MASK;
+ clen = ramdisk_size;
+ if (clen > MAX_MAP_CHUNK-slop)
+ clen = MAX_MAP_CHUNK-slop;
+ mapaddr = ramdisk_image & PAGE_MASK;
+ p = early_ioremap(mapaddr, clen+slop);
+ memcpy(q, p+slop, clen);
+ early_iounmap(p, clen+slop);
+ q += clen;
+ ramdisk_image += clen;
+ ramdisk_size -= clen;
+ }
+}
+
+#endif /* CONFIG_BLK_DEV_INITRD */
+
+void __init setup_bootmem_allocator(void)
+{
+ unsigned long bootmap_size;
+ /*
+ * Initialize the boot-time allocator (with low memory only):
+ */
+ bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
+
+ register_bootmem_low_pages(max_low_pfn);
+
+ /*
+ * Reserve the bootmem bitmap itself as well. We do this in two
+ * steps (first step was init_bootmem()) because this catches
+ * the (very unlikely) case of us accidentally initializing the
+ * bootmem allocator with an invalid RAM area.
+ */
+ reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
+ bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
+ BOOTMEM_DEFAULT);
+
+#ifndef CONFIG_XEN
+ /*
+ * reserve physical page 0 - it's a special BIOS page on many boxes,
+ * enabling clean reboots, SMP operation, laptop functions.
+ */
+ reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
+
+ /* reserve EBDA region */
+ reserve_ebda_region();
+
+#ifdef CONFIG_SMP
+ /*
+ * But first pinch a few for the stack/trampoline stuff
+ * FIXME: Don't need the extra page at 4K, but need to fix
+ * trampoline before removing it. (see the GDT stuff)
+ */
+ reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
+#endif
+#ifdef CONFIG_ACPI_SLEEP
+ /*
+ * Reserve low memory region for sleep support.
+ */
+ acpi_reserve_bootmem();
+#endif
+#endif /* !CONFIG_XEN */
+
+#ifdef CONFIG_BLK_DEV_INITRD
+ reserve_initrd();
+#endif
+ numa_kva_reserve();
+ reserve_crashkernel();
+
+ reserve_ibft_region();
+}
+
+/*
+ * The node 0 pgdat is initialized before all of these because
+ * it's needed for bootmem. node>0 pgdats have their virtual
+ * space allocated before the pagetables are in place to access
+ * them, so they can't be cleared then.
+ *
+ * This should all compile down to nothing when NUMA is off.
+ */
+static void __init remapped_pgdat_init(void)
+{
+ int nid;
+
+ for_each_online_node(nid) {
+ if (nid != 0)
+ memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+ }
+}
+
+#ifdef CONFIG_MCA
+static void set_mca_bus(int x)
+{
+ MCA_bus = x;
+}
+#else
+static void set_mca_bus(int x) { }
+#endif
+
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+char * __init __attribute__((weak)) memory_setup(void)
+{
+ return machine_specific_memory_setup();
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * In the golden day, when everything among i386 and x86_64 will be
+ * integrated, this will not live here
+ */
+void *x86_cpu_to_node_map_early_ptr;
+int x86_cpu_to_node_map_init[NR_CPUS] = {
+ [0 ... NR_CPUS-1] = NUMA_NO_NODE
+};
+DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
+#endif
+
+/*
+ * Determine if we were loaded by an EFI loader. If so, then we have also been
+ * passed the efi memmap, systab, etc., so we should use these data structures
+ * for initialization. Note, the efi init code path is determined by the
+ * global efi_enabled. This allows the same kernel image to be used on existing
+ * systems (with a traditional BIOS) as well as on EFI systems.
+ */
+void __init setup_arch(char **cmdline_p)
+{
+ int i, j, k, fpp;
+ struct physdev_set_iopl set_iopl;
+ unsigned long max_low_pfn;
+ unsigned long p2m_pages;
+
+ /* Force a quick death if the kernel panics (not domain 0). */
+ extern int panic_timeout;
+ if (!is_initial_xendomain()) {
+ if (!panic_timeout)
+ panic_timeout = 1;
+
+ /* Register a call for panic conditions. */
+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+ }
+
+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_4gb_segments));
+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_writable_pagetables));
+
+ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+ pre_setup_arch_hook();
+ early_cpu_init();
+ early_ioremap_init();
+#ifdef CONFIG_SMP
+ prefill_possible_map();
+#endif
+
+#ifdef CONFIG_EFI
+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ "EL32", 4))
+ efi_enabled = 1;
+#endif
+
+ /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
+ properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
+ */
+ ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
+ screen_info = boot_params.screen_info;
+ copy_edid();
+ apm_info.bios = boot_params.apm_bios_info;
+ ist_info = boot_params.ist_info;
+ saved_video_mode = boot_params.hdr.vid_mode;
+ if( boot_params.sys_desc_table.length != 0 ) {
+ set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
+ machine_id = boot_params.sys_desc_table.table[0];
+ machine_submodel_id = boot_params.sys_desc_table.table[1];
+ BIOS_revision = boot_params.sys_desc_table.table[2];
+ }
+ bootloader_type = boot_params.hdr.type_of_loader;
+
+ if (is_initial_xendomain()) {
+ const struct dom0_vga_console_info *info =
+ (void *)((char *)xen_start_info +
+ xen_start_info->console.dom0.info_off);
+
+ dom0_init_screen_info(info,
+ xen_start_info->console.dom0.info_size);
+ xen_start_info->console.domU.mfn = 0;
+ xen_start_info->console.domU.evtchn = 0;
+ } else
+ screen_info.orig_video_isVGA = 0;
+
+#ifdef CONFIG_BLK_DEV_RAM
+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+ rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
+ rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
+#endif
+
+ ARCH_SETUP
+
+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+ print_memory_map(memory_setup());
+
+ copy_edd();
+
+ if (!boot_params.hdr.root_flags)
+ root_mountflags &= ~MS_RDONLY;
+ init_mm.start_code = (unsigned long) _text;
+ init_mm.end_code = (unsigned long) _etext;
+ init_mm.end_data = (unsigned long) _edata;
+ init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
+ xen_start_info->nr_pt_frames) << PAGE_SHIFT;
+
+ code_resource.start = virt_to_phys(_text);
+ code_resource.end = virt_to_phys(_etext)-1;
+ data_resource.start = virt_to_phys(_etext);
+ data_resource.end = virt_to_phys(_edata)-1;
+ bss_resource.start = virt_to_phys(&__bss_start);
+ bss_resource.end = virt_to_phys(&__bss_stop)-1;
+
+ if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
+ i = COMMAND_LINE_SIZE;
+ memcpy(boot_command_line, xen_start_info->cmd_line, i);
+ boot_command_line[i - 1] = '\0';
+ parse_early_param();
+
+ if (user_defined_memmap) {
+ printk(KERN_INFO "user-defined physical RAM map:\n");
+ print_memory_map("user");
+ }
+
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+ if (efi_enabled)
+ efi_init();
+
+ /* update e820 for memory not covered by WB MTRRs */
+ propagate_e820_map();
+ mtrr_bp_init();
+#ifndef CONFIG_XEN
+ if (mtrr_trim_uncached_memory(max_pfn))
+ propagate_e820_map();
+#endif
+
+ max_low_pfn = setup_memory();
+
+#ifdef CONFIG_KVM_CLOCK
+ kvmclock_init();
+#endif
+
+#ifdef CONFIG_VMI
+ /*
+ * Must be after max_low_pfn is determined, and before kernel
+ * pagetables are setup.
+ */
+ vmi_init();
+#endif
+ kvm_guest_init();
+
+ /*
+ * NOTE: before this point _nobody_ is allowed to allocate
+ * any memory using the bootmem allocator. Although the
+ * allocator is now initialised only the first 8Mb of the kernel
+ * virtual address space has been mapped. All allocations before
+ * paging_init() has completed must use the alloc_bootmem_low_pages()
+ * variant (which allocates DMA'able memory) and care must be taken
+ * not to exceed the 8Mb limit.
+ */
+
+#ifdef CONFIG_SMP
+ smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
+#endif
+ paging_init();
+
+ /*
+ * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
+ */
+
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ if (init_ohci1394_dma_early)
+ init_ohci1394_dma_on_all_controllers();
+#endif
+
+ remapped_pgdat_init();
+ sparse_init();
+ zone_sizes_init();
+
+#ifdef CONFIG_X86_FIND_SMP_CONFIG
+ /*
+ * Find and reserve possible boot-time SMP configuration:
+ */
+ find_smp_config();
+#endif
+
+ p2m_pages = max_pfn;
+ if (xen_start_info->nr_pages > max_pfn) {
+ /*
+ * the max_pfn was shrunk (probably by mem= or highmem=
+ * kernel parameter); shrink reservation with the HV
+ */
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ unsigned int difference;
+ int ret;
+
+ difference = xen_start_info->nr_pages - max_pfn;
+
+ set_xen_guest_handle(reservation.extent_start,
+ ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
+ reservation.nr_extents = difference;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ BUG_ON (ret != difference);
+ }
+ else if (max_pfn > xen_start_info->nr_pages)
+ p2m_pages = xen_start_info->nr_pages;
+
+ /* Make sure we have a correctly sized P->M table. */
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ phys_to_machine_mapping = alloc_bootmem_low_pages(
+ max_pfn * sizeof(unsigned long));
+ memset(phys_to_machine_mapping, ~0,
+ max_pfn * sizeof(unsigned long));
+ memcpy(phys_to_machine_mapping,
+ (unsigned long *)xen_start_info->mfn_list,
+ p2m_pages * sizeof(unsigned long));
+ free_bootmem(
+ __pa(xen_start_info->mfn_list),
+ PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
+ sizeof(unsigned long))));
+
+ /*
+ * Initialise the list of the frames that specify the list of
+ * frames that make up the p2m table. Used by save/restore
+ */
+ pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
+
+ fpp = PAGE_SIZE/sizeof(unsigned long);
+ for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
+ if ((j % fpp) == 0) {
+ k++;
+ BUG_ON(k>=16);
+ pfn_to_mfn_frame_list[k] =
+ alloc_bootmem_low_pages(PAGE_SIZE);
+ pfn_to_mfn_frame_list_list[k] =
+ virt_to_mfn(pfn_to_mfn_frame_list[k]);
+ j=0;
+ }
+ pfn_to_mfn_frame_list[k][j] =
+ virt_to_mfn(&phys_to_machine_mapping[i]);
+ }
+ HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(pfn_to_mfn_frame_list_list);
+ }
+
+ /* Mark all ISA DMA channels in-use - using them wouldn't work. */
+ for (i = 0; i < MAX_DMA_CHANNELS; ++i)
+ if (i != 4 && request_dma(i, "xen") != 0)
+ BUG();
+
+ /*
+ * NOTE: at this point the bootmem allocator is fully available.
+ */
+
+#ifdef CONFIG_BLK_DEV_INITRD
+ relocate_initrd();
+#endif
+
+ paravirt_post_allocator_init();
+
+ if (is_initial_xendomain())
+ dmi_scan_machine();
+
+ io_delay_init();
+
+#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
+ /*
+ * setup to use the early static init tables during kernel startup
+ * X86_SMP will exclude sub-arches that don't deal well with it.
+ */
+ x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
+ x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
+#ifdef CONFIG_NUMA
+ x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
+#endif
+#endif
+
+#ifdef CONFIG_X86_GENERICARCH
+ generic_apic_probe();
+#endif
+
+ set_iopl.iopl = 1;
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
+
+#ifdef CONFIG_ACPI
+ if (!is_initial_xendomain()) {
+ printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+ acpi_disabled = 1;
+ acpi_ht = 0;
+ }
+
+ /*
+ * Parse the ACPI tables for possible boot-time SMP configuration.
+ */
+ acpi_boot_table_init();
+#endif
+
+#ifndef CONFIG_XEN
+ early_quirks();
+#endif
+
+#ifdef CONFIG_ACPI
+ acpi_boot_init();
+
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
+ if (def_to_bigsmp)
+ printk(KERN_WARNING "More than 8 CPUs detected and "
+ "CONFIG_X86_PC cannot handle it.\nUse "
+ "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
+#endif
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (smp_found_config)
+ get_smp_config();
+#endif
+
+ e820_register_memory();
+ e820_mark_nosave_regions();
+
+ if (is_initial_xendomain()) {
+#ifdef CONFIG_VT
+#if defined(CONFIG_VGA_CONSOLE)
+ if (!efi_enabled ||
+ (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ conswitchp = &vga_con;
+#elif defined(CONFIG_DUMMY_CONSOLE)
+ conswitchp = &dummy_con;
+#endif
+#endif
+ } else {
+#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
+ conswitchp = &dummy_con;
+#endif
+ }
+}
+
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ HYPERVISOR_shutdown(SHUTDOWN_crash);
+ /* we're never actually going to get here... */
+ return NOTIFY_DONE;
+}
+
+/*
+ * Request address space for all standard resources
+ *
+ * This is called just before pcibios_init(), which is also a
+ * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
+ */
+static int __init request_standard_resources(void)
+{
+ int i;
+
+ /* Nothing to do if not running in dom0. */
+ if (!is_initial_xendomain())
+ return 0;
+
+ printk(KERN_INFO "Setting up standard PCI resources\n");
+ init_iomem_resources(&code_resource, &data_resource, &bss_resource);
+
+ request_resource(&iomem_resource, &video_ram_resource);
+
+ /* request I/O space for devices used on all i[345]86 PCs */
+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+ request_resource(&ioport_resource, &standard_io_resources[i]);
+ return 0;
+}
+
+subsys_initcall(request_standard_resources);
--- /dev/null
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/screen_info.h>
+#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/initrd.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+#include <linux/console.h>
+#include <linux/seq_file.h>
+#include <linux/crash_dump.h>
+#include <linux/root_dev.h>
+#include <linux/pci.h>
+#include <asm/pci-direct.h>
+#include <linux/efi.h>
+#include <linux/acpi.h>
+#include <linux/kallsyms.h>
+#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
+#include <linux/mmzone.h>
+#include <linux/kexec.h>
+#include <linux/cpufreq.h>
+#include <linux/dmi.h>
+#include <linux/dma-mapping.h>
+#include <linux/ctype.h>
+#include <linux/sort.h>
+#include <linux/uaccess.h>
+#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
+
+#include <asm/mtrr.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/vsyscall.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/msr.h>
+#include <asm/desc.h>
+#include <video/edid.h>
+#include <asm/e820.h>
+#include <asm/dma.h>
+#include <asm/gart.h>
+#include <asm/mpspec.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/numa.h>
+#include <asm/sections.h>
+#include <asm/dmi.h>
+#include <asm/cacheflush.h>
+#include <asm/mce.h>
+#include <asm/ds.h>
+#include <asm/topology.h>
+#include <asm/pat.h>
+
+#include <mach_apic.h>
+#ifdef CONFIG_XEN
+#include <linux/percpu.h>
+#include <xen/interface/physdev.h>
+#include "setup_arch_pre.h"
+#include <asm/hypervisor.h>
+#include <xen/interface/nmi.h>
+#include <xen/features.h>
+#include <xen/firmware.h>
+#include <xen/xencons.h>
+#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
+#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
+#include <asm/mach-xen/setup_arch_post.h>
+#include <xen/interface/memory.h>
+
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
+extern unsigned long start_pfn;
+extern struct edid_info edid_info;
+
+shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
+EXPORT_SYMBOL(HYPERVISOR_shared_info);
+
+extern char hypercall_page[PAGE_SIZE];
+EXPORT_SYMBOL(hypercall_page);
+
+static int xen_panic_event(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xen_panic_block = {
+ xen_panic_event, NULL, 0 /* try to go last */
+};
+
+unsigned long *phys_to_machine_mapping;
+unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
+
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
+DEFINE_PER_CPU(int, nr_multicall_ents);
+
+/* Raw start-of-day parameters from the hypervisor. */
+start_info_t *xen_start_info;
+EXPORT_SYMBOL(xen_start_info);
+#endif
+
+/*
+ * Machine setup..
+ */
+
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
+
+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+
+unsigned long mmu_cr4_features;
+
+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
+int bootloader_type;
+
+unsigned long saved_video_mode;
+
+int force_mwait __cpuinitdata;
+
+/*
+ * Early DMI memory
+ */
+int dmi_alloc_index;
+char dmi_alloc_data[DMI_MAX_DATA];
+
+/*
+ * Setup options
+ */
+struct screen_info screen_info;
+EXPORT_SYMBOL(screen_info);
+struct sys_desc_table_struct {
+ unsigned short length;
+ unsigned char table[0];
+};
+
+struct edid_info edid_info;
+EXPORT_SYMBOL_GPL(edid_info);
+
+extern int root_mountflags;
+
+char __initdata command_line[COMMAND_LINE_SIZE];
+
+static struct resource standard_io_resources[] = {
+ { .name = "dma1", .start = 0x00, .end = 0x1f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic1", .start = 0x20, .end = 0x21,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer0", .start = 0x40, .end = 0x43,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer1", .start = 0x50, .end = 0x53,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x60, .end = 0x60,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x64, .end = 0x64,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma page reg", .start = 0x80, .end = 0x8f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic2", .start = 0xa0, .end = 0xa1,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma2", .start = 0xc0, .end = 0xdf,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "fpu", .start = 0xf0, .end = 0xff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO }
+};
+
+#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
+
+static struct resource data_resource = {
+ .name = "Kernel data",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_RAM,
+};
+static struct resource code_resource = {
+ .name = "Kernel code",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_RAM,
+};
+static struct resource bss_resource = {
+ .name = "Kernel bss",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_RAM,
+};
+
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
+
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel. This option will be passed
+ * by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+ char *end;
+ if (!arg)
+ return -EINVAL;
+ elfcorehdr_addr = memparse(arg, &end);
+ return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
+#endif
+
+#ifndef CONFIG_NUMA
+static void __init
+contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long bootmap_size, bootmap;
+
+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
+ bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
+ PAGE_SIZE);
+ if (bootmap == -1L)
+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
+ e820_register_active_regions(0, start_pfn, end_pfn);
+#ifdef CONFIG_XEN
+ free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
+ early_res_to_bootmem(0, xen_start_info->nr_pages<<PAGE_SHIFT);
+#else
+ free_bootmem_with_active_regions(0, end_pfn);
+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
+#endif
+ reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
+}
+#endif
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+struct edd edd;
+#ifdef CONFIG_EDD_MODULE
+EXPORT_SYMBOL(edd);
+#endif
+#ifndef CONFIG_XEN
+/**
+ * copy_edd() - Copy the BIOS EDD information
+ * from boot_params into a safe place.
+ *
+ */
+static inline void copy_edd(void)
+{
+ memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
+ sizeof(edd.mbr_signature));
+ memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
+ edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
+ edd.edd_info_nr = boot_params.eddbuf_entries;
+}
+#endif
+#else
+static inline void copy_edd(void)
+{
+}
+#endif
+
+#ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
+static void __init reserve_crashkernel(void)
+{
+ unsigned long long total_mem;
+ unsigned long long crash_size, crash_base;
+ int ret;
+
+ total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
+
+ ret = parse_crashkernel(boot_command_line, total_mem,
+ &crash_size, &crash_base);
+ if (ret == 0 && crash_size) {
+ if (crash_base <= 0) {
+ printk(KERN_INFO "crashkernel reservation failed - "
+ "you have to specify a base address\n");
+ return;
+ }
+
+ if (reserve_bootmem(crash_base, crash_size,
+ BOOTMEM_EXCLUSIVE) < 0) {
+ printk(KERN_INFO "crashkernel reservation failed - "
+ "memory is in use\n");
+ return;
+ }
+
+ printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+ "for crashkernel (System RAM: %ldMB)\n",
+ (unsigned long)(crash_size >> 20),
+ (unsigned long)(crash_base >> 20),
+ (unsigned long)(total_mem >> 20));
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+ insert_resource(&iomem_resource, &crashk_res);
+ }
+}
+#else
+#define reserve_crashkernel xen_machine_kexec_setup_resources
+#endif
+#else
+static inline void __init reserve_crashkernel(void)
+{}
+#endif
+
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+void __attribute__((weak)) __init memory_setup(void)
+{
+ machine_specific_memory_setup();
+}
+
+static void __init parse_setup_data(void)
+{
+ struct setup_data *data;
+ unsigned long pa_data;
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, PAGE_SIZE);
+ switch (data->type) {
+ default:
+ break;
+ }
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+ free_early(pa_data, pa_data+sizeof(*data)+data->len);
+#endif
+ pa_data = data->next;
+ early_iounmap(data, PAGE_SIZE);
+ }
+}
+
+#ifdef CONFIG_PCI_MMCONFIG
+extern void __cpuinit fam10h_check_enable_mmcfg(void);
+extern void __init check_enable_amd_mmconf_dmi(void);
+#else
+void __cpuinit fam10h_check_enable_mmcfg(void)
+{
+}
+void __init check_enable_amd_mmconf_dmi(void)
+{
+}
+#endif
+
+/*
+ * setup_arch - architecture-specific boot-time initializations
+ *
+ * Note: On x86_64, fixmaps are ready for use even before this is called.
+ */
+void __init setup_arch(char **cmdline_p)
+{
+ unsigned i;
+
+#ifdef CONFIG_XEN
+ extern struct e820map machine_e820;
+
+ printk(KERN_INFO "Command line: %s\n", boot_command_line);
+
+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_writable_pagetables));
+
+ early_ioremap_init();
+
+ ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
+ screen_info = boot_params.screen_info;
+
+ if (is_initial_xendomain()) {
+ const struct dom0_vga_console_info *info =
+ (void *)((char *)xen_start_info +
+ xen_start_info->console.dom0.info_off);
+
+ dom0_init_screen_info(info,
+ xen_start_info->console.dom0.info_size);
+ xen_start_info->console.domU.mfn = 0;
+ xen_start_info->console.domU.evtchn = 0;
+ } else {
+ screen_info.orig_video_isVGA = 0;
+
+ /* Register a call for panic conditions. */
+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+ }
+
+ copy_edid();
+#else
+ printk(KERN_INFO "Command line: %s\n", boot_command_line);
+
+ ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+ screen_info = boot_params.screen_info;
+ edid_info = boot_params.edid_info;
+#endif /* !CONFIG_XEN */
+ saved_video_mode = boot_params.hdr.vid_mode;
+ bootloader_type = boot_params.hdr.type_of_loader;
+
+#ifdef CONFIG_BLK_DEV_RAM
+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+ rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
+ rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
+#endif
+#ifdef CONFIG_EFI
+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ "EL64", 4))
+ efi_enabled = 1;
+#endif
+
+ ARCH_SETUP
+
+ memory_setup();
+ copy_edd();
+
+ if (!boot_params.hdr.root_flags)
+ root_mountflags &= ~MS_RDONLY;
+ init_mm.start_code = (unsigned long) &_text;
+ init_mm.end_code = (unsigned long) &_etext;
+ init_mm.end_data = (unsigned long) &_edata;
+ init_mm.brk = (unsigned long) &_end;
+
+ code_resource.start = virt_to_phys(&_text);
+ code_resource.end = virt_to_phys(&_etext)-1;
+ data_resource.start = virt_to_phys(&_etext);
+ data_resource.end = virt_to_phys(&_edata)-1;
+ bss_resource.start = virt_to_phys(&__bss_start);
+ bss_resource.end = virt_to_phys(&__bss_stop)-1;
+
+ early_identify_cpu(&boot_cpu_data);
+
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+ parse_setup_data();
+
+ parse_early_param();
+
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ if (init_ohci1394_dma_early)
+ init_ohci1394_dma_on_all_controllers();
+#endif
+
+ finish_e820_parsing();
+
+ /* after parse_early_param, so could debug it */
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
+
+ early_gart_iommu_check();
+
+ e820_register_active_regions(0, 0, -1UL);
+ /*
+ * partially used pages are not usable - thus
+ * we are rounding upwards:
+ */
+ end_pfn = e820_end_of_ram();
+ /* update e820 for memory not covered by WB MTRRs */
+ mtrr_bp_init();
+#ifndef CONFIG_XEN
+ if (mtrr_trim_uncached_memory(end_pfn)) {
+ e820_register_active_regions(0, 0, -1UL);
+ end_pfn = e820_end_of_ram();
+ }
+#endif
+
+ num_physpages = end_pfn;
+ max_mapnr = end_pfn;
+
+ check_efer();
+
+ max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
+ if (efi_enabled)
+ efi_init();
+
+#ifndef CONFIG_XEN
+ vsmp_init();
+#endif
+
+ if (is_initial_xendomain())
+ dmi_scan_machine();
+
+ io_delay_init();
+
+#ifdef CONFIG_KVM_CLOCK
+ kvmclock_init();
+#endif
+
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+ /* setup to use the early static init tables during kernel startup */
+ x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
+ x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
+#ifdef CONFIG_NUMA
+ x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
+#endif
+#endif
+
+ /* How many end-of-memory variables you have, grandma! */
+ max_low_pfn = end_pfn;
+ max_pfn = end_pfn;
+ high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
+
+ /* Remove active ranges so rediscovery with NUMA-awareness happens */
+ remove_all_active_ranges();
+
+#ifdef CONFIG_ACPI_NUMA
+ /*
+ * Parse SRAT to discover nodes.
+ */
+ acpi_numa_init();
+#endif
+
+#ifdef CONFIG_NUMA
+ numa_initmem_init(0, end_pfn);
+#else
+ contig_initmem_init(0, end_pfn);
+#endif
+
+#ifndef CONFIG_XEN
+ dma32_reserve_bootmem();
+
+#ifdef CONFIG_ACPI_SLEEP
+ /*
+ * Reserve low memory region for sleep support.
+ */
+ acpi_reserve_bootmem();
+#endif
+
+ if (efi_enabled)
+ efi_reserve_bootmem();
+#endif
+
+#ifdef CONFIG_BLK_DEV_INITRD
+#ifdef CONFIG_XEN
+ if (xen_start_info->mod_start) {
+ unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
+ unsigned long ramdisk_size = xen_start_info->mod_len;
+#else
+ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+ unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
+ unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
+#endif
+ unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
+ unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
+
+ if (ramdisk_end <= end_of_mem) {
+ /*
+ * don't need to reserve again, already reserved early
+ * in x86_64_start_kernel, and early_res_to_bootmem
+ * convert that to reserved in bootmem
+ */
+ initrd_start = ramdisk_image + PAGE_OFFSET;
+ initrd_end = initrd_start+ramdisk_size;
+#ifdef CONFIG_XEN
+ initrd_below_start_ok = 1;
+#endif
+ } else {
+ free_bootmem(ramdisk_image, ramdisk_size);
+ printk(KERN_ERR "initrd extends beyond end of memory "
+ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+ ramdisk_end, end_of_mem);
+ initrd_start = 0;
+ }
+ }
+#endif
+ reserve_crashkernel();
+
+ reserve_ibft_region();
+
+ paging_init();
+ map_vsyscall();
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * Find and reserve possible boot-time SMP configuration:
+ */
+ find_smp_config();
+#endif
+#ifdef CONFIG_XEN
+ {
+ int i, j, k, fpp;
+ unsigned long p2m_pages;
+
+ p2m_pages = end_pfn;
+ if (xen_start_info->nr_pages > end_pfn) {
+ /*
+ * the end_pfn was shrunk (probably by mem= or highmem=
+ * kernel parameter); shrink reservation with the HV
+ */
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ unsigned int difference;
+ int ret;
+
+ difference = xen_start_info->nr_pages - end_pfn;
+
+ set_xen_guest_handle(reservation.extent_start,
+ ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
+ reservation.nr_extents = difference;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ BUG_ON (ret != difference);
+ }
+ else if (end_pfn > xen_start_info->nr_pages)
+ p2m_pages = xen_start_info->nr_pages;
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* Make sure we have a large enough P->M table. */
+ phys_to_machine_mapping = alloc_bootmem_pages(
+ end_pfn * sizeof(unsigned long));
+ memset(phys_to_machine_mapping, ~0,
+ end_pfn * sizeof(unsigned long));
+ memcpy(phys_to_machine_mapping,
+ (unsigned long *)xen_start_info->mfn_list,
+ p2m_pages * sizeof(unsigned long));
+ free_bootmem(
+ __pa(xen_start_info->mfn_list),
+ PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
+ sizeof(unsigned long))));
+
+ /*
+ * Initialise the list of the frames that specify the
+ * list of frames that make up the p2m table. Used by
+ * save/restore.
+ */
+ pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
+
+ fpp = PAGE_SIZE/sizeof(unsigned long);
+ for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
+ if ((j % fpp) == 0) {
+ k++;
+ BUG_ON(k>=fpp);
+ pfn_to_mfn_frame_list[k] =
+ alloc_bootmem_pages(PAGE_SIZE);
+ pfn_to_mfn_frame_list_list[k] =
+ virt_to_mfn(pfn_to_mfn_frame_list[k]);
+ j=0;
+ }
+ pfn_to_mfn_frame_list[k][j] =
+ virt_to_mfn(&phys_to_machine_mapping[i]);
+ }
+ HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(pfn_to_mfn_frame_list_list);
+ }
+
+ /* Mark all ISA DMA channels in-use - using them wouldn't work. */
+ for (i = 0; i < MAX_DMA_CHANNELS; ++i)
+ if (i != 4 && request_dma(i, "xen") != 0)
+ BUG();
+ }
+
+#ifdef CONFIG_ACPI
+ if (!is_initial_xendomain()) {
+ acpi_disabled = 1;
+ acpi_ht = 0;
+ }
+#endif
+#endif
+
+#ifndef CONFIG_XEN
+ early_quirks();
+#endif
+
+#ifdef CONFIG_ACPI
+ /*
+ * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
+ * Call this early for SRAT node setup.
+ */
+ acpi_boot_table_init();
+
+ /*
+ * Read APIC and some other early information from ACPI tables.
+ */
+ acpi_boot_init();
+#endif
+
+ init_cpu_to_node();
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * get boot-time SMP configuration:
+ */
+ if (smp_found_config)
+ get_smp_config();
+#ifndef CONFIG_XEN
+ init_apic_mappings();
+ ioapic_init_mappings();
+#endif
+#endif
+#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
+ prefill_possible_map();
+#endif
+
+ kvm_guest_init();
+
+ /*
+ * We trust e820 completely. No explicit ROM probing in memory.
+ */
+#ifdef CONFIG_XEN
+ if (is_initial_xendomain())
+ e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
+#else
+ e820_reserve_resources(e820.map, e820.nr_map);
+ e820_mark_nosave_regions();
+#endif
+
+ /* request I/O space for devices used on all i[345]86 PCs */
+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+ request_resource(&ioport_resource, &standard_io_resources[i]);
+
+#ifdef CONFIG_XEN
+ if (is_initial_xendomain())
+ e820_setup_gap(machine_e820.map, machine_e820.nr_map);
+#else
+ e820_setup_gap(e820.map, e820.nr_map);
+#endif
+
+#ifdef CONFIG_XEN
+ {
+ struct physdev_set_iopl set_iopl;
+
+ set_iopl.iopl = 1;
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
+
+ if (is_initial_xendomain()) {
+#ifdef CONFIG_VT
+#if defined(CONFIG_VGA_CONSOLE)
+ conswitchp = &vga_con;
+#elif defined(CONFIG_DUMMY_CONSOLE)
+ conswitchp = &dummy_con;
+#endif
+#endif
+ } else {
+#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
+ conswitchp = &dummy_con;
+#endif
+ }
+ }
+#else /* CONFIG_XEN */
+
+#ifdef CONFIG_VT
+#if defined(CONFIG_VGA_CONSOLE)
+ if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ conswitchp = &vga_con;
+#elif defined(CONFIG_DUMMY_CONSOLE)
+ conswitchp = &dummy_con;
+#endif
+#endif
+
+#endif /* !CONFIG_XEN */
+
+ /* do this before identify_cpu for boot cpu */
+ check_enable_amd_mmconf_dmi();
+}
+
+#ifdef CONFIG_XEN
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ HYPERVISOR_shutdown(SHUTDOWN_crash);
+ /* we're never actually going to get here... */
+ return NOTIFY_DONE;
+}
+#endif /* !CONFIG_XEN */
+
+
+static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+{
+ unsigned int *v;
+
+ if (c->extended_cpuid_level < 0x80000004)
+ return 0;
+
+ v = (unsigned int *) c->x86_model_id;
+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+ c->x86_model_id[48] = 0;
+ return 1;
+}
+
+
+static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+{
+ unsigned int n, dummy, eax, ebx, ecx, edx;
+
+ n = c->extended_cpuid_level;
+
+ if (n >= 0x80000005) {
+ cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
+ "D cache %dK (%d bytes/line)\n",
+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+ c->x86_cache_size = (ecx>>24) + (edx>>24);
+ /* On K8 L1 TLB is inclusive, so don't count it */
+ c->x86_tlbsize = 0;
+ }
+
+ if (n >= 0x80000006) {
+ cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
+ ecx = cpuid_ecx(0x80000006);
+ c->x86_cache_size = ecx >> 16;
+ c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+
+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
+ c->x86_cache_size, ecx & 0xFF);
+ }
+ if (n >= 0x80000008) {
+ cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+ }
+}
+
+#ifdef CONFIG_NUMA
+static int __cpuinit nearby_node(int apicid)
+{
+ int i, node;
+
+ for (i = apicid - 1; i >= 0; i--) {
+ node = apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+ node = apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
+/*
+ * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
+ * Assumes number of cores is a power of two.
+ */
+static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned bits;
+#ifdef CONFIG_NUMA
+ int cpu = smp_processor_id();
+ int node = 0;
+ unsigned apicid = hard_smp_processor_id();
+#endif
+ bits = c->x86_coreid_bits;
+
+ /* Low order bits define the core id (index of core in socket) */
+ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+ /* Convert the initial APIC ID into the socket ID */
+ c->phys_proc_id = c->initial_apicid >> bits;
+
+#ifdef CONFIG_NUMA
+ node = c->phys_proc_id;
+ if (apicid_to_node[apicid] != NUMA_NO_NODE)
+ node = apicid_to_node[apicid];
+ if (!node_online(node)) {
+ /* Two possibilities here:
+ - The CPU is missing memory and no node was created.
+ In that case try picking one from a nearby CPU
+ - The APIC IDs differ from the HyperTransport node IDs
+ which the K8 northbridge parsing fills in.
+ Assume they are all increased by a constant offset,
+ but in the same order as the HT nodeids.
+ If that doesn't result in a usable node fall back to the
+ path for the previous case. */
+
+ int ht_nodeid = c->initial_apicid;
+
+ if (ht_nodeid >= 0 &&
+ apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ node = apicid_to_node[ht_nodeid];
+ /* Pick a nearby node */
+ if (!node_online(node))
+ node = nearby_node(apicid);
+ }
+ numa_set_node(cpu, node);
+
+ printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+#endif
+}
+
+static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned bits, ecx;
+
+ /* Multi core CPU? */
+ if (c->extended_cpuid_level < 0x80000008)
+ return;
+
+ ecx = cpuid_ecx(0x80000008);
+
+ c->x86_max_cores = (ecx & 0xff) + 1;
+
+ /* CPU telling us the core id bits shift? */
+ bits = (ecx >> 12) & 0xF;
+
+ /* Otherwise recompute */
+ if (bits == 0) {
+ while ((1 << bits) < c->x86_max_cores)
+ bits++;
+ }
+
+ c->x86_coreid_bits = bits;
+
+#endif
+}
+
+#define ENABLE_C1E_MASK 0x18000000
+#define CPUID_PROCESSOR_SIGNATURE 1
+#define CPUID_XFAM 0x0ff00000
+#define CPUID_XFAM_K8 0x00000000
+#define CPUID_XFAM_10H 0x00100000
+#define CPUID_XFAM_11H 0x00200000
+#define CPUID_XMOD 0x000f0000
+#define CPUID_XMOD_REV_F 0x00040000
+
+#ifndef CONFIG_XEN
+/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
+static __cpuinit int amd_apic_timer_broken(void)
+{
+ u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
+
+ switch (eax & CPUID_XFAM) {
+ case CPUID_XFAM_K8:
+ if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
+ break;
+ case CPUID_XFAM_10H:
+ case CPUID_XFAM_11H:
+ rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
+ if (lo & ENABLE_C1E_MASK)
+ return 1;
+ break;
+ default:
+ /* err on the side of caution */
+ return 1;
+ }
+ return 0;
+}
+#endif
+
+static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+{
+ early_init_amd_mc(c);
+
+ /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
+ if (c->x86_power & (1<<8))
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
+static void __cpuinit init_amd(struct cpuinfo_x86 *c)
+{
+ unsigned level;
+
+#ifdef CONFIG_SMP
+ unsigned long value;
+
+ /*
+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
+ * bit 6 of msr C001_0015
+ *
+ * Errata 63 for SH-B3 steppings
+ * Errata 122 for all steppings (F+ have it disabled by default)
+ */
+ if (c->x86 == 15) {
+ rdmsrl(MSR_K8_HWCR, value);
+ value |= 1 << 6;
+ wrmsrl(MSR_K8_HWCR, value);
+ }
+#endif
+
+ /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
+ clear_cpu_cap(c, 0*32+31);
+
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ level = cpuid_eax(1);
+ if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
+ level >= 0x0f58))
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ if (c->x86 == 0x10 || c->x86 == 0x11)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* Enable workaround for FXSAVE leak */
+ if (c->x86 >= 6)
+ set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
+
+ level = get_model_name(c);
+ if (!level) {
+ switch (c->x86) {
+ case 15:
+ /* Should distinguish Models here, but this is only
+ a fallback anyways. */
+ strcpy(c->x86_model_id, "Hammer");
+ break;
+ }
+ }
+ display_cacheinfo(c);
+
+ /* Multi core CPU? */
+ if (c->extended_cpuid_level >= 0x80000008)
+ amd_detect_cmp(c);
+
+ if (c->extended_cpuid_level >= 0x80000006 &&
+ (cpuid_edx(0x80000006) & 0xf000))
+ num_cache_leaves = 4;
+ else
+ num_cache_leaves = 3;
+
+ if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
+ set_cpu_cap(c, X86_FEATURE_K8);
+
+ /* MFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+
+ if (c->x86 == 0x10)
+ fam10h_check_enable_mmcfg();
+
+#ifndef CONFIG_XEN
+ if (amd_apic_timer_broken())
+ disable_apic_timer = 1;
+
+ if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+ unsigned long long tseg;
+
+ /*
+ * Split up direct mapping around the TSEG SMM area.
+ * Don't do it for gbpages because there seems very little
+ * benefit in doing so.
+ */
+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
+ (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
+ set_memory_4k((unsigned long)__va(tseg), 1);
+ }
+#endif
+}
+
+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ u32 eax, ebx, ecx, edx;
+ int index_msb, core_bits;
+
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+
+
+ if (!cpu_has(c, X86_FEATURE_HT))
+ return;
+ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+ goto out;
+
+ smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+ if (smp_num_siblings == 1) {
+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
+ } else if (smp_num_siblings > 1) {
+
+ if (smp_num_siblings > NR_CPUS) {
+ printk(KERN_WARNING "CPU: Unsupported number of "
+ "siblings %d", smp_num_siblings);
+ smp_num_siblings = 1;
+ return;
+ }
+
+ index_msb = get_count_order(smp_num_siblings);
+ c->phys_proc_id = phys_pkg_id(index_msb);
+
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+
+ index_msb = get_count_order(smp_num_siblings);
+
+ core_bits = get_count_order(c->x86_max_cores);
+
+ c->cpu_core_id = phys_pkg_id(index_msb) &
+ ((1 << core_bits) - 1);
+ }
+out:
+ if ((c->x86_max_cores * smp_num_siblings) > 1) {
+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
+ c->phys_proc_id);
+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
+ c->cpu_core_id);
+ }
+
+#endif
+}
+
+/*
+ * find out the number of processor cores on the die
+ */
+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
+{
+ unsigned int eax, t;
+
+ if (c->cpuid_level < 4)
+ return 1;
+
+ cpuid_count(4, 0, &eax, &t, &t, &t);
+
+ if (eax & 0x1f)
+ return ((eax >> 26) + 1);
+ else
+ return 1;
+}
+
+static void __cpuinit srat_detect_node(void)
+{
+#ifdef CONFIG_NUMA
+ unsigned node;
+ int cpu = smp_processor_id();
+ int apicid = hard_smp_processor_id();
+
+ /* Don't do the funky fallback heuristics the AMD version employs
+ for now. */
+ node = apicid_to_node[apicid];
+ if (node == NUMA_NO_NODE || !node_online(node))
+ node = first_node(node_online_map);
+ numa_set_node(cpu, node);
+
+ printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+}
+
+static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
+{
+ if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+ (c->x86 == 0x6 && c->x86_model >= 0x0e))
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
+static void __cpuinit init_intel(struct cpuinfo_x86 *c)
+{
+ /* Cache sizes */
+ unsigned n;
+
+ init_intel_cacheinfo(c);
+ if (c->cpuid_level > 9) {
+ unsigned eax = cpuid_eax(10);
+ /* Check for version and the number of counters */
+ if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
+ if (cpu_has_ds) {
+ unsigned int l1, l2;
+ rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+ if (!(l1 & (1<<11)))
+ set_cpu_cap(c, X86_FEATURE_BTS);
+ if (!(l1 & (1<<12)))
+ set_cpu_cap(c, X86_FEATURE_PEBS);
+ }
+
+
+ if (cpu_has_bts)
+ ds_init_intel(c);
+
+ n = c->extended_cpuid_level;
+ if (n >= 0x80000008) {
+ unsigned eax = cpuid_eax(0x80000008);
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+ /* CPUID workaround for Intel 0F34 CPU */
+ if (c->x86_vendor == X86_VENDOR_INTEL &&
+ c->x86 == 0xF && c->x86_model == 0x3 &&
+ c->x86_mask == 0x4)
+ c->x86_phys_bits = 36;
+ }
+
+ if (c->x86 == 15)
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ if (c->x86 == 6)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+ c->x86_max_cores = intel_num_cpu_cores(c);
+
+ srat_detect_node();
+}
+
+static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
+{
+ if (c->x86 == 0x6 && c->x86_model >= 0xf)
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
+static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+{
+ /* Cache sizes */
+ unsigned n;
+
+ n = c->extended_cpuid_level;
+ if (n >= 0x80000008) {
+ unsigned eax = cpuid_eax(0x80000008);
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+ }
+
+ if (c->x86 == 0x6 && c->x86_model >= 0xf) {
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ }
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+}
+
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+{
+ char *v = c->x86_vendor_id;
+
+ if (!strcmp(v, "AuthenticAMD"))
+ c->x86_vendor = X86_VENDOR_AMD;
+ else if (!strcmp(v, "GenuineIntel"))
+ c->x86_vendor = X86_VENDOR_INTEL;
+ else if (!strcmp(v, "CentaurHauls"))
+ c->x86_vendor = X86_VENDOR_CENTAUR;
+ else
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+}
+
+/* Do some early cpuid on the boot CPU to get some parameter that are
+ needed before check_bugs. Everything advanced is in identify_cpu
+ below. */
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
+{
+ u32 tfms, xlvl;
+
+ c->loops_per_jiffy = loops_per_jiffy;
+ c->x86_cache_size = -1;
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+ c->x86_model = c->x86_mask = 0; /* So far unknown... */
+ c->x86_vendor_id[0] = '\0'; /* Unset */
+ c->x86_model_id[0] = '\0'; /* Unset */
+ c->x86_clflush_size = 64;
+ c->x86_cache_alignment = c->x86_clflush_size;
+ c->x86_max_cores = 1;
+ c->x86_coreid_bits = 0;
+ c->extended_cpuid_level = 0;
+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
+
+ /* Get vendor name */
+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+ (unsigned int *)&c->x86_vendor_id[0],
+ (unsigned int *)&c->x86_vendor_id[8],
+ (unsigned int *)&c->x86_vendor_id[4]);
+
+ get_cpu_vendor(c);
+
+ /* Initialize the standard set of capabilities */
+ /* Note that the vendor-specific code below might override */
+
+ /* Intel-defined flags: level 0x00000001 */
+ if (c->cpuid_level >= 0x00000001) {
+ __u32 misc;
+ cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
+ &c->x86_capability[0]);
+ c->x86 = (tfms >> 8) & 0xf;
+ c->x86_model = (tfms >> 4) & 0xf;
+ c->x86_mask = tfms & 0xf;
+ if (c->x86 == 0xf)
+ c->x86 += (tfms >> 20) & 0xff;
+ if (c->x86 >= 0x6)
+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+ } else {
+ /* Have CPUID level 0 only - unheard of */
+ c->x86 = 4;
+ }
+
+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
+#ifdef CONFIG_SMP
+ c->phys_proc_id = c->initial_apicid;
+#endif
+ /* AMD-defined flags: level 0x80000001 */
+ xlvl = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = xlvl;
+ if ((xlvl & 0xffff0000) == 0x80000000) {
+ if (xlvl >= 0x80000001) {
+ c->x86_capability[1] = cpuid_edx(0x80000001);
+ c->x86_capability[6] = cpuid_ecx(0x80000001);
+ }
+ if (xlvl >= 0x80000004)
+ get_model_name(c); /* Default name */
+ }
+
+ /* Transmeta-defined flags: level 0x80860001 */
+ xlvl = cpuid_eax(0x80860000);
+ if ((xlvl & 0xffff0000) == 0x80860000) {
+ /* Don't set x86_cpuid_level here for now to not confuse. */
+ if (xlvl >= 0x80860001)
+ c->x86_capability[2] = cpuid_edx(0x80860001);
+ }
+
+ c->extended_cpuid_level = cpuid_eax(0x80000000);
+ if (c->extended_cpuid_level >= 0x80000007)
+ c->x86_power = cpuid_edx(0x80000007);
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ early_init_amd(c);
+ break;
+ case X86_VENDOR_INTEL:
+ early_init_intel(c);
+ break;
+ case X86_VENDOR_CENTAUR:
+ early_init_centaur(c);
+ break;
+ }
+
+ validate_pat_support(c);
+}
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ early_identify_cpu(c);
+
+ init_scattered_cpuid_features(c);
+
+ c->apicid = phys_pkg_id(0);
+
+ /*
+ * Vendor-specific initialization. In this section we
+ * canonicalize the feature flags, meaning if there are
+ * features a certain CPU supports which CPUID doesn't
+ * tell us, CPUID claiming incorrect flags, or other bugs,
+ * we handle them here.
+ *
+ * At the end of this section, c->x86_capability better
+ * indicate the features this CPU genuinely supports!
+ */
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ init_amd(c);
+ break;
+
+ case X86_VENDOR_INTEL:
+ init_intel(c);
+ break;
+
+ case X86_VENDOR_CENTAUR:
+ init_centaur(c);
+ break;
+
+ case X86_VENDOR_UNKNOWN:
+ default:
+ display_cacheinfo(c);
+ break;
+ }
+
+ detect_ht(c);
+
+ /*
+ * On SMP, boot_cpu_data holds the common feature set between
+ * all CPUs; so make sure that we indicate which features are
+ * common between the CPUs. The first time this routine gets
+ * executed, c == &boot_cpu_data.
+ */
+ if (c != &boot_cpu_data) {
+ /* AND the already accumulated flags with these */
+ for (i = 0; i < NCAPINTS; i++)
+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+ }
+
+ /* Clear all flags overriden by options */
+ for (i = 0; i < NCAPINTS; i++)
+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
+
+#ifdef CONFIG_X86_MCE
+ mcheck_init(c);
+#endif
+ select_idle_routine(c);
+
+#ifdef CONFIG_NUMA
+ numa_add_cpu(smp_processor_id());
+#endif
+
+}
+
+void __cpuinit identify_boot_cpu(void)
+{
+ identify_cpu(&boot_cpu_data);
+}
+
+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
+ BUG_ON(c == &boot_cpu_data);
+ identify_cpu(c);
+ mtrr_ap_init();
+}
+
+static __init int setup_noclflush(char *arg)
+{
+ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+ return 1;
+}
+__setup("noclflush", setup_noclflush);
+
+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+{
+ if (c->x86_model_id[0])
+ printk(KERN_CONT "%s", c->x86_model_id);
+
+ if (c->x86_mask || c->cpuid_level >= 0)
+ printk(KERN_CONT " stepping %02x\n", c->x86_mask);
+ else
+ printk(KERN_CONT "\n");
+}
+
+static __init int setup_disablecpuid(char *arg)
+{
+ int bit;
+ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
+ setup_clear_cpu_cap(bit);
+ else
+ return 0;
+ return 1;
+}
+__setup("clearcpuid=", setup_disablecpuid);
--- /dev/null
+/*
+ * Intel SMP support routines.
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
+ *
+ * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
+ *
+ * This code is released under the GNU General Public License version 2 or
+ * later.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <mach_ipi.h>
+#include <xen/evtchn.h>
+/*
+ * Some notes on x86 processor bugs affecting SMP operation:
+ *
+ * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
+ * The Linux implications for SMP are handled as follows:
+ *
+ * Pentium III / [Xeon]
+ * None of the E1AP-E3AP errata are visible to the user.
+ *
+ * E1AP. see PII A1AP
+ * E2AP. see PII A2AP
+ * E3AP. see PII A3AP
+ *
+ * Pentium II / [Xeon]
+ * None of the A1AP-A3AP errata are visible to the user.
+ *
+ * A1AP. see PPro 1AP
+ * A2AP. see PPro 2AP
+ * A3AP. see PPro 7AP
+ *
+ * Pentium Pro
+ * None of 1AP-9AP errata are visible to the normal user,
+ * except occasional delivery of 'spurious interrupt' as trap #15.
+ * This is very rare and a non-problem.
+ *
+ * 1AP. Linux maps APIC as non-cacheable
+ * 2AP. worked around in hardware
+ * 3AP. fixed in C0 and above steppings microcode update.
+ * Linux does not use excessive STARTUP_IPIs.
+ * 4AP. worked around in hardware
+ * 5AP. symmetric IO mode (normal Linux operation) not affected.
+ * 'noapic' mode has vector 0xf filled out properly.
+ * 6AP. 'noapic' mode might be affected - fixed in later steppings
+ * 7AP. We do not assume writes to the LVT deassering IRQs
+ * 8AP. We do not enable low power mode (deep sleep) during MP bootup
+ * 9AP. We do not use mixed mode
+ *
+ * Pentium
+ * There is a marginal case where REP MOVS on 100MHz SMP
+ * machines with B stepping processors can fail. XXX should provide
+ * an L1cache=Writethrough or L1cache=off option.
+ *
+ * B stepping CPUs may hang. There are hardware work arounds
+ * for this. We warn about it in case your board doesn't have the work
+ * arounds. Basically that's so I can tell anyone with a B stepping
+ * CPU and SMP problems "tough".
+ *
+ * Specific items [From Pentium Processor Specification Update]
+ *
+ * 1AP. Linux doesn't use remote read
+ * 2AP. Linux doesn't trust APIC errors
+ * 3AP. We work around this
+ * 4AP. Linux never generated 3 interrupts of the same priority
+ * to cause a lost local interrupt.
+ * 5AP. Remote read is never used
+ * 6AP. not affected - worked around in hardware
+ * 7AP. not affected - worked around in hardware
+ * 8AP. worked around in hardware - we get explicit CS errors if not
+ * 9AP. only 'noapic' mode affected. Might generate spurious
+ * interrupts, we log only the first one and count the
+ * rest silently.
+ * 10AP. not affected - worked around in hardware
+ * 11AP. Linux reads the APIC between writes to avoid this, as per
+ * the documentation. Make sure you preserve this as it affects
+ * the C stepping chips too.
+ * 12AP. not affected - worked around in hardware
+ * 13AP. not affected - worked around in hardware
+ * 14AP. we always deassert INIT during bootup
+ * 15AP. not affected - worked around in hardware
+ * 16AP. not affected - worked around in hardware
+ * 17AP. not affected - worked around in hardware
+ * 18AP. not affected - worked around in hardware
+ * 19AP. not affected - worked around in BIOS
+ *
+ * If this sounds worrying believe me these bugs are either ___RARE___,
+ * or are signal timing bugs worked around in hardware and there's
+ * about nothing of note with C stepping upwards.
+ */
+
+/*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ */
+void xen_smp_send_reschedule(int cpu)
+{
+ if (unlikely(cpu_is_offline(cpu))) {
+ WARN_ON(1);
+ return;
+ }
+ send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+}
+
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ */
+static DEFINE_SPINLOCK(call_lock);
+
+struct call_data_struct {
+ void (*func) (void *info);
+ void *info;
+ atomic_t started;
+ atomic_t finished;
+ int wait;
+};
+
+void lock_ipi_call_lock(void)
+{
+ spin_lock_irq(&call_lock);
+}
+
+void unlock_ipi_call_lock(void)
+{
+ spin_unlock_irq(&call_lock);
+}
+
+static struct call_data_struct *call_data;
+
+static void __smp_call_function(void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ struct call_data_struct data;
+ int cpus = num_online_cpus() - 1;
+
+ if (!cpus)
+ return;
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ mb();
+
+ /* Send a message to all other CPUs and wait for them to respond */
+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus)
+ cpu_relax();
+
+ if (wait)
+ while (atomic_read(&data.finished) != cpus)
+ cpu_relax();
+}
+
+
+/**
+ * smp_call_function_mask(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on. Must not include the current cpu.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+int
+xen_smp_call_function_mask(cpumask_t mask,
+ void (*func)(void *), void *info,
+ int wait)
+{
+ struct call_data_struct data;
+ cpumask_t allbutself;
+ int cpus;
+
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON(irqs_disabled());
+
+ /* Holding any lock stops cpus from going down. */
+ spin_lock(&call_lock);
+
+ allbutself = cpu_online_map;
+ cpu_clear(smp_processor_id(), allbutself);
+
+ cpus_and(mask, mask, allbutself);
+ cpus = cpus_weight(mask);
+
+ if (!cpus) {
+ spin_unlock(&call_lock);
+ return 0;
+ }
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ wmb();
+
+ /* Send a message to other CPUs */
+ if (cpus_equal(mask, allbutself) &&
+ cpus_equal(cpu_online_map, cpu_callout_map))
+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+ else
+ send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus)
+ cpu_relax();
+
+ if (wait)
+ while (atomic_read(&data.finished) != cpus)
+ cpu_relax();
+ spin_unlock(&call_lock);
+
+ return 0;
+}
+
+static void stop_this_cpu(void *dummy)
+{
+ local_irq_disable();
+ /*
+ * Remove this CPU:
+ */
+ cpu_clear(smp_processor_id(), cpu_online_map);
+ disable_all_local_evtchn();
+ if (hlt_works(smp_processor_id()))
+ for (;;) halt();
+ for (;;);
+}
+
+/*
+ * this function calls the 'stop' function on all other CPUs in the system.
+ */
+
+void xen_smp_send_stop(void)
+{
+ int nolock;
+ unsigned long flags;
+
+ /* Don't deadlock on the call lock in panic */
+ nolock = !spin_trylock(&call_lock);
+ local_irq_save(flags);
+ __smp_call_function(stop_this_cpu, NULL, 0, 0);
+ if (!nolock)
+ spin_unlock(&call_lock);
+ disable_all_local_evtchn();
+ local_irq_restore(flags);
+}
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */
+irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
+{
+#ifdef CONFIG_X86_32
+ __get_cpu_var(irq_stat).irq_resched_count++;
+#else
+ add_pda(irq_resched_count, 1);
+#endif
+ return IRQ_HANDLED;
+}
+
+irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
+{
+ void (*func) (void *info) = call_data->func;
+ void *info = call_data->info;
+ int wait = call_data->wait;
+
+ /*
+ * Notify initiating CPU that I've grabbed the data and am
+ * about to execute the function
+ */
+ mb();
+ atomic_inc(&call_data->started);
+ /*
+ * At this point the info structure may be out of scope unless wait==1
+ */
+ irq_enter();
+ (*func)(info);
+#ifdef CONFIG_X86_32
+ __get_cpu_var(irq_stat).irq_call_count++;
+#else
+ add_pda(irq_call_count, 1);
+#endif
+ irq_exit();
+
+ if (wait) {
+ mb();
+ atomic_inc(&call_data->finished);
+ }
+
+ return IRQ_HANDLED;
+}
--- /dev/null
+/*
+ * Copyright (C) 1991, 1992, 1995 Linus Torvalds
+ *
+ * This file contains the PC-specific time handling details:
+ * reading the RTC at bootup, etc..
+ * 1994-07-02 Alan Modra
+ * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
+ * 1995-03-26 Markus Kuhn
+ * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
+ * precision CMOS clock update
+ * 1996-05-03 Ingo Molnar
+ * fixed time warps in do_[slow|fast]_gettimeoffset()
+ * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
+ * "A Kernel Model for Precision Timekeeping" by Dave Mills
+ * 1998-09-05 (Various)
+ * More robust do_fast_gettimeoffset() algorithm implemented
+ * (works with APM, Cyrix 6x86MX and Centaur C6),
+ * monotonic gettimeofday() with fast_get_timeoffset(),
+ * drift-proof precision TSC calibration on boot
+ * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
+ * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
+ * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
+ * 1998-12-16 Andrea Arcangeli
+ * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
+ * because was not accounting lost_ticks.
+ * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
+ * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ * serialize accesses to xtime/lost_ticks).
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/time.h>
+#include <linux/mca.h>
+#include <linux/sysctl.h>
+#include <linux/percpu.h>
+#include <linux/kernel_stat.h>
+#include <linux/posix-timers.h>
+#include <linux/cpufreq.h>
+#include <linux/clocksource.h>
+#include <linux/sysdev.h>
+
+#include <asm/delay.h>
+#include <asm/time.h>
+
+#include <xen/evtchn.h>
+#include <xen/sysctl.h>
+#include <xen/interface/vcpu.h>
+
+#include <asm/i8253.h>
+DEFINE_SPINLOCK(i8253_lock);
+EXPORT_SYMBOL(i8253_lock);
+
+#ifdef CONFIG_X86_64
+#include <asm/vsyscall.h>
+volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+#endif
+
+#define XEN_SHIFT 22
+
+unsigned int cpu_khz; /* Detected as we calibrate the TSC */
+EXPORT_SYMBOL(cpu_khz);
+
+/* These are peridically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+ u64 tsc_timestamp; /* TSC at last update of time vals. */
+ u64 system_timestamp; /* Time, in nanosecs, since boot. */
+ u32 tsc_to_nsec_mul;
+ u32 tsc_to_usec_mul;
+ int tsc_shift;
+ u32 version;
+};
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+static struct timespec shadow_tv;
+static u32 shadow_tv_version;
+
+/* Keep track of last time we did processing/updating of jiffies and xtime. */
+static u64 processed_system_time; /* System time (ns) at last processing. */
+static DEFINE_PER_CPU(u64, processed_system_time);
+
+/* How much CPU time was spent blocked and how much was 'stolen'? */
+static DEFINE_PER_CPU(u64, processed_stolen_time);
+static DEFINE_PER_CPU(u64, processed_blocked_time);
+
+/* Current runstate of each CPU (updated automatically by the hypervisor). */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+
+/* Must be signed, as it's compared with s64 quantities which can be -ve. */
+#define NS_PER_TICK (1000000000LL/HZ)
+
+static void __clock_was_set(struct work_struct *unused)
+{
+ clock_was_set();
+}
+static DECLARE_WORK(clock_was_set_work, __clock_was_set);
+
+static inline void __normalize_time(time_t *sec, s64 *nsec)
+{
+ while (*nsec >= NSEC_PER_SEC) {
+ (*nsec) -= NSEC_PER_SEC;
+ (*sec)++;
+ }
+ while (*nsec < 0) {
+ (*nsec) += NSEC_PER_SEC;
+ (*sec)--;
+ }
+}
+
+/* Does this guest OS track Xen time, or set its wall clock independently? */
+static int independent_wallclock = 0;
+static int __init __independent_wallclock(char *str)
+{
+ independent_wallclock = 1;
+ return 1;
+}
+__setup("independent_wallclock", __independent_wallclock);
+
+int xen_independent_wallclock(void)
+{
+ return independent_wallclock;
+}
+
+/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
+static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
+static int __init __permitted_clock_jitter(char *str)
+{
+ permitted_clock_jitter = simple_strtoul(str, NULL, 0);
+ return 1;
+}
+__setup("permitted_clock_jitter=", __permitted_clock_jitter);
+
+#if 0
+static void delay_tsc(unsigned long loops)
+{
+ unsigned long bclock, now;
+
+ rdtscl(bclock);
+ do {
+ rep_nop();
+ rdtscl(now);
+ } while ((now - bclock) < loops);
+}
+
+struct timer_opts timer_tsc = {
+ .name = "tsc",
+ .delay = delay_tsc,
+};
+#endif
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+ u64 product;
+#ifdef __i386__
+ u32 tmp1, tmp2;
+#endif
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#ifdef __i386__
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#else
+ __asm__ (
+ "mul %%rdx ; shrd $32,%%rdx,%%rax"
+ : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#endif
+
+ return product;
+}
+
+static inline u64 get64(volatile u64 *ptr)
+{
+#ifndef CONFIG_64BIT
+ return cmpxchg64(ptr, 0, 0);
+#else
+ return *ptr;
+#endif
+}
+
+static inline u64 get64_local(volatile u64 *ptr)
+{
+#ifndef CONFIG_64BIT
+ return cmpxchg64_local(ptr, 0, 0);
+#else
+ return *ptr;
+#endif
+}
+
+#if 0 /* defined (__i386__) */
+int read_current_timer(unsigned long *timer_val)
+{
+ rdtscl(*timer_val);
+ return 0;
+}
+#endif
+
+static void init_cpu_khz(void)
+{
+ u64 __cpu_khz = 1000000ULL << 32;
+ struct vcpu_time_info *info = &vcpu_info(0)->time;
+ do_div(__cpu_khz, info->tsc_to_system_mul);
+ if (info->tsc_shift < 0)
+ cpu_khz = __cpu_khz << -info->tsc_shift;
+ else
+ cpu_khz = __cpu_khz >> info->tsc_shift;
+}
+
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+ u64 now, delta;
+ rdtscll(now);
+ delta = now - shadow->tsc_timestamp;
+ return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+static void __update_wallclock(time_t sec, long nsec)
+{
+ long wtm_nsec, xtime_nsec;
+ time_t wtm_sec, xtime_sec;
+ u64 tmp, wc_nsec;
+
+ /* Adjust wall-clock time base. */
+ wc_nsec = processed_system_time;
+ wc_nsec += sec * (u64)NSEC_PER_SEC;
+ wc_nsec += nsec;
+
+ /* Split wallclock base into seconds and nanoseconds. */
+ tmp = wc_nsec;
+ xtime_nsec = do_div(tmp, 1000000000);
+ xtime_sec = (time_t)tmp;
+
+ wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
+ wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
+
+ set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
+ set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+
+ ntp_clear();
+}
+
+static void update_wallclock(void)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+
+ do {
+ shadow_tv_version = s->wc_version;
+ rmb();
+ shadow_tv.tv_sec = s->wc_sec;
+ shadow_tv.tv_nsec = s->wc_nsec;
+ rmb();
+ } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
+
+ if (!independent_wallclock)
+ __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
+}
+
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area.
+ */
+static void get_time_values_from_xen(unsigned int cpu)
+{
+ struct vcpu_time_info *src;
+ struct shadow_time_info *dst;
+
+ src = &vcpu_info(cpu)->time;
+ dst = &per_cpu(shadow_time, cpu);
+
+ do {
+ dst->version = src->version;
+ rmb();
+ dst->tsc_timestamp = src->tsc_timestamp;
+ dst->system_timestamp = src->system_time;
+ dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
+ dst->tsc_shift = src->tsc_shift;
+ rmb();
+ } while ((src->version & 1) | (dst->version ^ src->version));
+
+ dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
+}
+
+static inline int time_values_up_to_date(unsigned int cpu)
+{
+ struct vcpu_time_info *src;
+ struct shadow_time_info *dst;
+
+ src = &vcpu_info(cpu)->time;
+ dst = &per_cpu(shadow_time, cpu);
+
+ rmb();
+ return (dst->version == src->version);
+}
+
+static void sync_xen_wallclock(unsigned long dummy);
+static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
+static void sync_xen_wallclock(unsigned long dummy)
+{
+ time_t sec;
+ s64 nsec;
+ struct xen_platform_op op;
+
+ BUG_ON(!is_initial_xendomain());
+ if (!ntp_synced() || independent_wallclock)
+ return;
+
+ write_seqlock_irq(&xtime_lock);
+
+ sec = xtime.tv_sec;
+ nsec = xtime.tv_nsec;
+ __normalize_time(&sec, &nsec);
+
+ op.cmd = XENPF_settime;
+ op.u.settime.secs = sec;
+ op.u.settime.nsecs = nsec;
+ op.u.settime.system_time = processed_system_time;
+ WARN_ON(HYPERVISOR_platform_op(&op));
+
+ update_wallclock();
+
+ write_sequnlock_irq(&xtime_lock);
+
+ /* Once per minute. */
+ mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
+}
+
+static unsigned long long local_clock(void)
+{
+ unsigned int cpu = get_cpu();
+ struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+ u64 time;
+ u32 local_time_version;
+
+ do {
+ local_time_version = shadow->version;
+ barrier();
+ time = shadow->system_timestamp + get_nsec_offset(shadow);
+ if (!time_values_up_to_date(cpu))
+ get_time_values_from_xen(cpu);
+ barrier();
+ } while (local_time_version != shadow->version);
+
+ put_cpu();
+
+ return time;
+}
+
+/*
+ * Runstate accounting
+ */
+static void get_runstate_snapshot(struct vcpu_runstate_info *res)
+{
+ u64 state_time;
+ struct vcpu_runstate_info *state;
+
+ BUG_ON(preemptible());
+
+ state = &__get_cpu_var(runstate);
+
+ do {
+ state_time = get64_local(&state->state_entry_time);
+ *res = *state;
+ } while (get64_local(&state->state_entry_time) != state_time);
+
+ WARN_ON_ONCE(res->state != RUNSTATE_running);
+}
+
+/*
+ * Xen sched_clock implementation. Returns the number of unstolen
+ * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
+ * states.
+ */
+unsigned long long sched_clock(void)
+{
+ struct vcpu_runstate_info runstate;
+ cycle_t now;
+ u64 ret;
+ s64 offset;
+
+ /*
+ * Ideally sched_clock should be called on a per-cpu basis
+ * anyway, so preempt should already be disabled, but that's
+ * not current practice at the moment.
+ */
+ preempt_disable();
+
+ now = local_clock();
+
+ get_runstate_snapshot(&runstate);
+
+ offset = now - runstate.state_entry_time;
+ if (offset < 0)
+ offset = 0;
+
+ ret = offset + runstate.time[RUNSTATE_running]
+ + runstate.time[RUNSTATE_blocked];
+
+ preempt_enable();
+
+ return ret;
+}
+
+unsigned long profile_pc(struct pt_regs *regs)
+{
+ unsigned long pc = instruction_pointer(regs);
+
+#if defined(CONFIG_SMP) || defined(__x86_64__)
+# ifdef __i386__
+ if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs)
+# else
+ if (!user_mode(regs)
+# endif
+ && in_lock_functions(pc)) {
+# ifdef CONFIG_FRAME_POINTER
+ return ((unsigned long *)regs->bp)[1];
+# else
+# ifdef __i386__
+ unsigned long *sp = (unsigned long *)®s->sp;
+# else
+ unsigned long *sp = (unsigned long *)regs->sp;
+# endif
+
+ /* Return address is either directly at stack pointer
+ or above a saved flags. Eflags has bits 22-31 zero,
+ kernel addresses don't. */
+ if (sp[0] >> 22)
+ return sp[0];
+ if (sp[1] >> 22)
+ return sp[1];
+# endif
+ }
+#endif
+
+ return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+
+/*
+ * This is the same as the above, except we _also_ save the current
+ * Time Stamp Counter value at the time of the timer interrupt, so that
+ * we later on can estimate the time of day more exactly.
+ */
+irqreturn_t timer_interrupt(int irq, void *dev_id)
+{
+ s64 delta, delta_cpu, stolen, blocked;
+ unsigned int i, cpu = smp_processor_id();
+ struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+ struct vcpu_runstate_info runstate;
+
+ /* Keep nmi watchdog up to date */
+#ifdef __i386__
+ per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;
+#else
+ add_pda(irq0_irqs, 1);
+#endif
+
+ /*
+ * Here we are in the timer irq handler. We just have irqs locally
+ * disabled but we don't know if the timer_bh is running on the other
+ * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
+ * the irq version of write_lock because as just said we have irq
+ * locally disabled. -arca
+ */
+ write_seqlock(&xtime_lock);
+
+ do {
+ get_time_values_from_xen(cpu);
+
+ /* Obtain a consistent snapshot of elapsed wallclock cycles. */
+ delta = delta_cpu =
+ shadow->system_timestamp + get_nsec_offset(shadow);
+ delta -= processed_system_time;
+ delta_cpu -= per_cpu(processed_system_time, cpu);
+
+ get_runstate_snapshot(&runstate);
+ } while (!time_values_up_to_date(cpu));
+
+ if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
+ unlikely(delta_cpu < -(s64)permitted_clock_jitter))
+ && printk_ratelimit()) {
+ printk("Timer ISR/%u: Time went backwards: "
+ "delta=%lld delta_cpu=%lld shadow=%lld "
+ "off=%lld processed=%lld cpu_processed=%lld\n",
+ cpu, delta, delta_cpu, shadow->system_timestamp,
+ (s64)get_nsec_offset(shadow),
+ processed_system_time,
+ per_cpu(processed_system_time, cpu));
+ for (i = 0; i < num_online_cpus(); i++)
+ printk(" %d: %lld\n", i,
+ per_cpu(processed_system_time, i));
+ }
+
+ /* System-wide jiffy work. */
+ if (delta >= NS_PER_TICK) {
+ do_div(delta, NS_PER_TICK);
+ processed_system_time += delta * NS_PER_TICK;
+ while (delta > HZ) {
+ do_timer(HZ);
+ delta -= HZ;
+ }
+ do_timer(delta);
+ }
+
+ if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
+ update_wallclock();
+ if (keventd_up())
+ schedule_work(&clock_was_set_work);
+ }
+
+ write_sequnlock(&xtime_lock);
+
+ /*
+ * Account stolen ticks.
+ * HACK: Passing NULL to account_steal_time()
+ * ensures that the ticks are accounted as stolen.
+ */
+ stolen = runstate.time[RUNSTATE_runnable]
+ + runstate.time[RUNSTATE_offline]
+ - per_cpu(processed_stolen_time, cpu);
+ if ((stolen > 0) && (delta_cpu > 0)) {
+ delta_cpu -= stolen;
+ if (unlikely(delta_cpu < 0))
+ stolen += delta_cpu; /* clamp local-time progress */
+ do_div(stolen, NS_PER_TICK);
+ per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
+ per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
+ account_steal_time(NULL, (cputime_t)stolen);
+ }
+
+ /*
+ * Account blocked ticks.
+ * HACK: Passing idle_task to account_steal_time()
+ * ensures that the ticks are accounted as idle/wait.
+ */
+ blocked = runstate.time[RUNSTATE_blocked]
+ - per_cpu(processed_blocked_time, cpu);
+ if ((blocked > 0) && (delta_cpu > 0)) {
+ delta_cpu -= blocked;
+ if (unlikely(delta_cpu < 0))
+ blocked += delta_cpu; /* clamp local-time progress */
+ do_div(blocked, NS_PER_TICK);
+ per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
+ per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
+ account_steal_time(idle_task(cpu), (cputime_t)blocked);
+ }
+
+ /* Account user/system ticks. */
+ if (delta_cpu > 0) {
+ do_div(delta_cpu, NS_PER_TICK);
+ per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
+ if (user_mode_vm(get_irq_regs()))
+ account_user_time(current, (cputime_t)delta_cpu);
+ else
+ account_system_time(current, HARDIRQ_OFFSET,
+ (cputime_t)delta_cpu);
+ }
+
+ /* Offlined for more than a few seconds? Avoid lockup warnings. */
+ if (stolen > 5*HZ)
+ touch_softlockup_watchdog();
+
+ /* Local timer processing (see update_process_times()). */
+ run_local_timers();
+ if (rcu_pending(cpu))
+ rcu_check_callbacks(cpu, user_mode_vm(get_irq_regs()));
+ scheduler_tick();
+ run_posix_cpu_timers(current);
+ profile_tick(CPU_PROFILING);
+
+ return IRQ_HANDLED;
+}
+
+void mark_tsc_unstable(char *reason)
+{
+#ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. */
+ tsc_unstable = 1;
+#endif
+}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+static cycle_t cs_last;
+
+static cycle_t xen_clocksource_read(void)
+{
+ cycle_t ret = local_clock();
+
+#ifdef CONFIG_SMP
+ for (;;) {
+ cycle_t last = get64(&cs_last);
+
+ if ((s64)(ret - last) < 0) {
+ if (last - ret > permitted_clock_jitter
+ && printk_ratelimit()) {
+ unsigned int cpu = get_cpu();
+ struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+
+ printk(KERN_WARNING "clocksource/%u: "
+ "Time went backwards: "
+ "ret=%Lx delta=%Ld shadow=%Lx offset=%Lx\n",
+ cpu, ret, ret - last,
+ shadow->system_timestamp,
+ get_nsec_offset(shadow));
+ put_cpu();
+ }
+ ret = last;
+ }
+ if (cmpxchg64(&cs_last, last, ret) == last)
+ break;
+ }
+#endif
+
+ return ret;
+}
+
+static void xen_clocksource_resume(void)
+{
+ extern void time_resume(void);
+
+ time_resume();
+ cs_last = local_clock();
+}
+
+static struct clocksource clocksource_xen = {
+ .name = "xen",
+ .rating = 400,
+ .read = xen_clocksource_read,
+ .mask = CLOCKSOURCE_MASK(64),
+ .mult = 1 << XEN_SHIFT, /* time directly in nanoseconds */
+ .shift = XEN_SHIFT,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .resume = xen_clocksource_resume,
+};
+
+static void init_missing_ticks_accounting(unsigned int cpu)
+{
+ struct vcpu_register_runstate_memory_area area;
+ struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
+ int rc;
+
+ memset(runstate, 0, sizeof(*runstate));
+
+ area.addr.v = runstate;
+ rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
+ WARN_ON(rc && rc != -ENOSYS);
+
+ per_cpu(processed_blocked_time, cpu) =
+ runstate->time[RUNSTATE_blocked];
+ per_cpu(processed_stolen_time, cpu) =
+ runstate->time[RUNSTATE_runnable] +
+ runstate->time[RUNSTATE_offline];
+}
+
+unsigned long xen_read_persistent_clock(void)
+{
+ const shared_info_t *s = HYPERVISOR_shared_info;
+ u32 version, sec, nsec;
+ u64 delta;
+
+ do {
+ version = s->wc_version;
+ rmb();
+ sec = s->wc_sec;
+ nsec = s->wc_nsec;
+ rmb();
+ } while ((s->wc_version & 1) | (version ^ s->wc_version));
+
+ delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
+ do_div(delta, NSEC_PER_SEC);
+
+ return delta;
+}
+
+int xen_update_persistent_clock(void)
+{
+ if (!is_initial_xendomain())
+ return -1;
+ mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
+ return 0;
+}
+
+/* Dynamically-mapped IRQ. */
+DEFINE_PER_CPU(int, timer_irq);
+
+static void setup_cpu0_timer_irq(void)
+{
+ per_cpu(timer_irq, 0) =
+ bind_virq_to_irqhandler(
+ VIRQ_TIMER,
+ 0,
+ timer_interrupt,
+ IRQF_DISABLED|IRQF_NOBALANCING,
+ "timer0",
+ NULL);
+ BUG_ON(per_cpu(timer_irq, 0) < 0);
+}
+
+static struct vcpu_set_periodic_timer xen_set_periodic_tick = {
+ .period_ns = NS_PER_TICK
+};
+
+void __init time_init(void)
+{
+ init_cpu_khz();
+ printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
+ cpu_khz / 1000, cpu_khz % 1000);
+
+ switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0,
+ &xen_set_periodic_tick)) {
+ case 0:
+#if CONFIG_XEN_COMPAT <= 0x030004
+ case -ENOSYS:
+#endif
+ break;
+ default:
+ BUG();
+ }
+
+ get_time_values_from_xen(0);
+
+ processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
+ per_cpu(processed_system_time, 0) = processed_system_time;
+ init_missing_ticks_accounting(0);
+
+ clocksource_register(&clocksource_xen);
+
+ update_wallclock();
+
+#ifndef CONFIG_X86_64
+ use_tsc_delay();
+#endif
+
+ /* Cannot request_irq() until kmem is initialised. */
+ late_time_init = setup_cpu0_timer_irq;
+}
+
+/* Convert jiffies to system time. */
+u64 jiffies_to_st(unsigned long j)
+{
+ unsigned long seq;
+ long delta;
+ u64 st;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ delta = j - jiffies;
+ if (delta < 1) {
+ /* Triggers in some wrap-around cases, but that's okay:
+ * we just end up with a shorter timeout. */
+ st = processed_system_time + NS_PER_TICK;
+ } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
+ /* Very long timeout means there is no pending timer.
+ * We indicate this to Xen by passing zero timeout. */
+ st = 0;
+ } else {
+ st = processed_system_time + delta * (u64)NS_PER_TICK;
+ }
+ } while (read_seqretry(&xtime_lock, seq));
+
+ return st;
+}
+EXPORT_SYMBOL(jiffies_to_st);
+
+/*
+ * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
+ * These functions are based on implementations from arch/s390/kernel/time.c
+ */
+static void stop_hz_timer(void)
+{
+ struct vcpu_set_singleshot_timer singleshot;
+ unsigned int cpu = smp_processor_id();
+ unsigned long j;
+ int rc;
+
+ cpu_set(cpu, nohz_cpu_mask);
+
+ /* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */
+ /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */
+ /* value of rcp->cur that matches rdp->quiescbatch and allows us to */
+ /* stop the hz timer then the cpumasks created for subsequent values */
+ /* of cur in rcu_start_batch are guaranteed to pick up the updated */
+ /* nohz_cpu_mask and so will not depend on this cpu. */
+
+ smp_mb();
+
+ /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
+ if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
+ (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
+ cpu_clear(cpu, nohz_cpu_mask);
+ j = jiffies + 1;
+ }
+
+ singleshot.timeout_abs_ns = jiffies_to_st(j);
+ singleshot.flags = 0;
+ rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot);
+#if CONFIG_XEN_COMPAT <= 0x030004
+ if (rc) {
+ BUG_ON(rc != -ENOSYS);
+ rc = HYPERVISOR_set_timer_op(singleshot.timeout_abs_ns);
+ }
+#endif
+ BUG_ON(rc);
+}
+
+static void start_hz_timer(void)
+{
+ cpu_clear(smp_processor_id(), nohz_cpu_mask);
+}
+
+void xen_safe_halt(void)
+{
+ stop_hz_timer();
+ /* Blocking includes an implicit local_irq_enable(). */
+ HYPERVISOR_block();
+ start_hz_timer();
+}
+EXPORT_SYMBOL(xen_safe_halt);
+
+void xen_halt(void)
+{
+ if (irqs_disabled())
+ VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
+}
+EXPORT_SYMBOL(xen_halt);
+
+/* No locking required. Interrupts are disabled on all CPUs. */
+void time_resume(void)
+{
+ unsigned int cpu;
+
+ init_cpu_khz();
+
+ for_each_online_cpu(cpu) {
+ switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
+ &xen_set_periodic_tick)) {
+ case 0:
+#if CONFIG_XEN_COMPAT <= 0x030004
+ case -ENOSYS:
+#endif
+ break;
+ default:
+ BUG();
+ }
+ get_time_values_from_xen(cpu);
+ per_cpu(processed_system_time, cpu) =
+ per_cpu(shadow_time, 0).system_timestamp;
+ init_missing_ticks_accounting(cpu);
+ }
+
+ processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
+
+ update_wallclock();
+}
+
+#ifdef CONFIG_SMP
+static char timer_name[NR_CPUS][15];
+
+int __cpuinit local_setup_timer(unsigned int cpu)
+{
+ int seq, irq;
+
+ BUG_ON(cpu == 0);
+
+ switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
+ &xen_set_periodic_tick)) {
+ case 0:
+#if CONFIG_XEN_COMPAT <= 0x030004
+ case -ENOSYS:
+#endif
+ break;
+ default:
+ BUG();
+ }
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
+ per_cpu(processed_system_time, cpu) =
+ per_cpu(shadow_time, 0).system_timestamp;
+ init_missing_ticks_accounting(cpu);
+ } while (read_seqretry(&xtime_lock, seq));
+
+ sprintf(timer_name[cpu], "timer%u", cpu);
+ irq = bind_virq_to_irqhandler(VIRQ_TIMER,
+ cpu,
+ timer_interrupt,
+ IRQF_DISABLED|IRQF_NOBALANCING,
+ timer_name[cpu],
+ NULL);
+ if (irq < 0)
+ return irq;
+ per_cpu(timer_irq, cpu) = irq;
+
+ return 0;
+}
+
+void __cpuexit local_teardown_timer(unsigned int cpu)
+{
+ BUG_ON(cpu == 0);
+ unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
+}
+#endif
+
+#ifdef CONFIG_CPU_FREQ
+static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ struct xen_platform_op op;
+
+ if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
+ return 0;
+
+ if (val == CPUFREQ_PRECHANGE)
+ return 0;
+
+ op.cmd = XENPF_change_freq;
+ op.u.change_freq.flags = 0;
+ op.u.change_freq.cpu = freq->cpu;
+ op.u.change_freq.freq = (u64)freq->new * 1000;
+ WARN_ON(HYPERVISOR_platform_op(&op));
+
+ return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+ .notifier_call = time_cpufreq_notifier
+};
+
+static int __init cpufreq_time_setup(void)
+{
+ if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER)) {
+ printk(KERN_ERR "failed to set up cpufreq notifier\n");
+ return -ENODEV;
+ }
+ return 0;
+}
+
+core_initcall(cpufreq_time_setup);
+#endif
+
+/*
+ * /proc/sys/xen: This really belongs in another file. It can stay here for
+ * now however.
+ */
+static ctl_table xen_subtable[] = {
+ {
+ .ctl_name = CTL_XEN_INDEPENDENT_WALLCLOCK,
+ .procname = "independent_wallclock",
+ .data = &independent_wallclock,
+ .maxlen = sizeof(independent_wallclock),
+ .mode = 0644,
+ .strategy = sysctl_data,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .ctl_name = CTL_XEN_PERMITTED_CLOCK_JITTER,
+ .procname = "permitted_clock_jitter",
+ .data = &permitted_clock_jitter,
+ .maxlen = sizeof(permitted_clock_jitter),
+ .mode = 0644,
+ .strategy = sysctl_data,
+ .proc_handler = proc_doulongvec_minmax
+ },
+ { }
+};
+static ctl_table xen_table[] = {
+ {
+ .ctl_name = CTL_XEN,
+ .procname = "xen",
+ .mode = 0555,
+ .child = xen_subtable
+ },
+ { }
+};
+static int __init xen_sysctl_init(void)
+{
+ (void)register_sysctl_table(xen_table);
+ return 0;
+}
+__initcall(xen_sysctl_init);
--- /dev/null
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * 'Traps.c' handles hardware traps and faults after we have saved some
+ * state in 'asm.s'.
+ */
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/spinlock.h>
+#include <linux/highmem.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/unwind.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
+
+#ifdef CONFIG_EISA
+#include <linux/ioport.h>
+#include <linux/eisa.h>
+#endif
+
+#ifdef CONFIG_MCA
+#include <linux/mca.h>
+#endif
+
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
+#include <asm/arch_hooks.h>
+#include <asm/stacktrace.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/unwind.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+#include <asm/io.h>
+
+#include "mach_traps.h"
+
+int panic_on_unrecovered_nmi;
+
+#ifndef CONFIG_XEN
+DECLARE_BITMAP(used_vectors, NR_VECTORS);
+EXPORT_SYMBOL_GPL(used_vectors);
+#endif
+
+asmlinkage int system_call(void);
+
+/* Do we ignore FPU interrupts ? */
+char ignore_fpu_irq;
+
+#ifndef CONFIG_X86_NO_IDT
+/*
+ * The IDT has to be page-aligned to simplify the Pentium
+ * F0 0F bug workaround.. We have a special link segment
+ * for this.
+ */
+gate_desc idt_table[256]
+ __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
+#endif
+
+asmlinkage void divide_error(void);
+asmlinkage void debug(void);
+asmlinkage void nmi(void);
+asmlinkage void int3(void);
+asmlinkage void overflow(void);
+asmlinkage void bounds(void);
+asmlinkage void invalid_op(void);
+asmlinkage void device_not_available(void);
+asmlinkage void coprocessor_segment_overrun(void);
+asmlinkage void invalid_TSS(void);
+asmlinkage void segment_not_present(void);
+asmlinkage void stack_segment(void);
+asmlinkage void general_protection(void);
+asmlinkage void page_fault(void);
+asmlinkage void coprocessor_error(void);
+asmlinkage void simd_coprocessor_error(void);
+asmlinkage void alignment_check(void);
+#ifndef CONFIG_XEN
+asmlinkage void spurious_interrupt_bug(void);
+#else
+asmlinkage void fixup_4gb_segment(void);
+#endif
+asmlinkage void machine_check(void);
+
+int kstack_depth_to_print = 24;
+static unsigned int code_bytes = 64;
+#ifdef CONFIG_STACK_UNWIND
+static int call_trace = 1;
+#else
+#define call_trace (-1)
+#endif
+
+void printk_address(unsigned long address, int reliable)
+{
+#ifdef CONFIG_KALLSYMS
+ char namebuf[KSYM_NAME_LEN];
+ unsigned long offset = 0;
+ unsigned long symsize;
+ const char *symname;
+ char reliab[4] = "";
+ char *delim = ":";
+ char *modname;
+
+ symname = kallsyms_lookup(address, &symsize, &offset,
+ &modname, namebuf);
+ if (!symname) {
+ printk(" [<%08lx>]\n", address);
+ return;
+ }
+ if (!reliable)
+ strcpy(reliab, "? ");
+
+ if (!modname)
+ modname = delim = "";
+ printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
+ address, reliab, delim, modname, delim, symname, offset, symsize);
+#else
+ printk(" [<%08lx>]\n", address);
+#endif
+}
+
+static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
+{
+ return p > (void *)tinfo &&
+ p <= (void *)tinfo + THREAD_SIZE - size;
+}
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+ struct stack_frame *next_frame;
+ unsigned long return_address;
+};
+
+static inline unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data)
+{
+ struct stack_frame *frame = (struct stack_frame *)bp;
+
+ while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
+ unsigned long addr;
+
+ addr = *stack;
+ if (__kernel_text_address(addr)) {
+ if ((unsigned long) stack == bp + 4) {
+ ops->address(data, addr, 1);
+ frame = frame->next_frame;
+ bp = (unsigned long) frame;
+ } else {
+ ops->address(data, addr, bp == 0);
+ }
+ }
+ stack++;
+ }
+ return bp;
+}
+
+struct ops_and_data {
+ const struct stacktrace_ops *ops;
+ void *data;
+};
+
+static asmlinkage int
+dump_trace_unwind(struct unwind_frame_info *info, void *data)
+{
+ struct ops_and_data *oad = (struct ops_and_data *)data;
+ int n = 0;
+ unsigned long sp = UNW_SP(info);
+
+ if (arch_unw_user_mode(info))
+ return -1;
+ while (unwind(info) == 0 && UNW_PC(info)) {
+ n++;
+ oad->ops->address(oad->data, UNW_PC(info), 1);
+ if (arch_unw_user_mode(info))
+ break;
+ if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1))
+ && sp > UNW_SP(info))
+ break;
+ sp = UNW_SP(info);
+ }
+ return n;
+}
+
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data)
+{
+ if (!task)
+ task = current;
+
+ if (call_trace >= 0) {
+ int unw_ret = 0;
+ struct unwind_frame_info info;
+ struct ops_and_data oad = { .ops = ops, .data = data };
+
+ if (regs) {
+ if (unwind_init_frame_info(&info, task, regs) == 0)
+ unw_ret = dump_trace_unwind(&info, &oad);
+ } else if (task == current)
+ unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
+ else {
+ if (unwind_init_blocked(&info, task) == 0)
+ unw_ret = dump_trace_unwind(&info, &oad);
+ }
+ if (unw_ret > 0) {
+ if (call_trace == 1 && !arch_unw_user_mode(&info)) {
+ ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
+ UNW_PC(&info));
+ if (UNW_SP(&info) >= PAGE_OFFSET) {
+ ops->warning(data, "Leftover inexact backtrace:\n");
+ stack = (void *)UNW_SP(&info);
+ if (!stack)
+ return;
+ bp = UNW_FP(&info);
+ } else
+ ops->warning(data, "Full inexact backtrace again:\n");
+ } else if (call_trace >= 1)
+ return;
+ else
+ ops->warning(data, "Full inexact backtrace again:\n");
+ } else
+ ops->warning(data, "Inexact backtrace:\n");
+ }
+
+ if (!stack) {
+ unsigned long dummy;
+
+ stack = &dummy;
+ if (task != current)
+ stack = (unsigned long *)task->thread.sp;
+ }
+
+#ifdef CONFIG_FRAME_POINTER
+ if (!bp) {
+ if (task == current) {
+ /* Grab bp right from our regs */
+ asm("movl %%ebp, %0" : "=r" (bp) :);
+ } else {
+ /* bp is the last reg pushed by switch_to */
+ bp = *(unsigned long *) task->thread.sp;
+ }
+ }
+#endif
+
+ while (1) {
+ struct thread_info *context;
+
+ context = (struct thread_info *)
+ ((unsigned long)stack & (~(THREAD_SIZE - 1)));
+ bp = print_context_stack(context, stack, bp, ops, data);
+ /*
+ * Should be after the line below, but somewhere
+ * in early boot context comes out corrupted and we
+ * can't reference it:
+ */
+ if (ops->stack(data, "IRQ") < 0)
+ break;
+ stack = (unsigned long *)context->previous_esp;
+ if (!stack)
+ break;
+ touch_nmi_watchdog();
+ }
+}
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ printk(data);
+ print_symbol(msg, symbol);
+ printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+ printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+ return 0;
+}
+
+/*
+ * Print one address/symbol entries per line.
+ */
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+ printk("%s [<%08lx>] ", (char *)data, addr);
+ if (!reliable)
+ printk("? ");
+ print_symbol("%s\n", addr);
+ touch_nmi_watchdog();
+}
+
+static const struct stacktrace_ops print_trace_ops = {
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
+};
+
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp, char *log_lvl)
+{
+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+ printk("%s =======================\n", log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp)
+{
+ show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
+static void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp, char *log_lvl)
+{
+ unsigned long *stack;
+ int i;
+
+ if (sp == NULL) {
+ if (task)
+ sp = (unsigned long *)task->thread.sp;
+ else
+ sp = (unsigned long *)&sp;
+ }
+
+ stack = sp;
+ for (i = 0; i < kstack_depth_to_print; i++) {
+ if (kstack_end(stack))
+ break;
+ if (i && ((i % 8) == 0))
+ printk("\n%s ", log_lvl);
+ printk("%08lx ", *stack++);
+ }
+ printk("\n%sCall Trace:\n", log_lvl);
+
+ show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+}
+
+void show_stack(struct task_struct *task, unsigned long *sp)
+{
+ printk(" ");
+ show_stack_log_lvl(task, NULL, sp, 0, "");
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+ unsigned long bp = 0;
+ unsigned long stack;
+
+#ifdef CONFIG_FRAME_POINTER
+ if (!bp)
+ asm("movl %%ebp, %0" : "=r" (bp):);
+#endif
+
+ printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ current->pid, current->comm, print_tainted(),
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+
+ show_trace(current, NULL, &stack, bp);
+}
+
+EXPORT_SYMBOL(dump_stack);
+
+void show_registers(struct pt_regs *regs)
+{
+ int i;
+
+ print_modules();
+ __show_registers(regs, 0);
+
+ printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
+ TASK_COMM_LEN, current->comm, task_pid_nr(current),
+ current_thread_info(), current, task_thread_info(current));
+ /*
+ * When in-kernel, we also print out the stack and code at the
+ * time of the fault..
+ */
+ if (!user_mode_vm(regs)) {
+ unsigned int code_prologue = code_bytes * 43 / 64;
+ unsigned int code_len = code_bytes;
+ unsigned char c;
+ u8 *ip;
+
+ printk("\n" KERN_EMERG "Stack: ");
+ show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG);
+
+ printk(KERN_EMERG "Code: ");
+
+ ip = (u8 *)regs->ip - code_prologue;
+ if (ip < (u8 *)PAGE_OFFSET ||
+ probe_kernel_address(ip, c)) {
+ /* try starting at EIP */
+ ip = (u8 *)regs->ip;
+ code_len = code_len - code_prologue + 1;
+ }
+ for (i = 0; i < code_len; i++, ip++) {
+ if (ip < (u8 *)PAGE_OFFSET ||
+ probe_kernel_address(ip, c)) {
+ printk(" Bad EIP value.");
+ break;
+ }
+ if (ip == (u8 *)regs->ip)
+ printk("<%02x> ", c);
+ else
+ printk("%02x ", c);
+ }
+ }
+ printk("\n");
+}
+
+int is_valid_bugaddr(unsigned long ip)
+{
+ unsigned short ud2;
+
+ if (ip < PAGE_OFFSET)
+ return 0;
+ if (probe_kernel_address((unsigned short *)ip, ud2))
+ return 0;
+
+ return ud2 == 0x0b0f;
+}
+
+static int die_counter;
+
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+{
+ unsigned short ss;
+ unsigned long sp;
+
+ printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+ printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+ printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk("DEBUG_PAGEALLOC");
+#endif
+ printk("\n");
+
+ sysfs_printk_last_file();
+ if (notify_die(DIE_OOPS, str, regs, err,
+ current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
+
+ show_registers(regs);
+ /* Executive summary in case the oops scrolled away */
+ sp = (unsigned long) (®s->sp);
+ savesegment(ss, ss);
+ if (user_mode(regs)) {
+ sp = regs->sp;
+ ss = regs->ss & 0xffff;
+ }
+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+ print_symbol("%s", regs->ip);
+ printk(" SS:ESP %04x:%08lx\n", ss, sp);
+
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * This is gone through when something in the kernel has done something bad
+ * and is about to be terminated:
+ */
+void die(const char *str, struct pt_regs *regs, long err)
+{
+ static struct {
+ raw_spinlock_t lock;
+ u32 lock_owner;
+ int lock_owner_depth;
+ } die = {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED,
+ .lock_owner = -1,
+ .lock_owner_depth = 0
+ };
+ unsigned long flags;
+
+ oops_enter();
+
+ if (die.lock_owner != raw_smp_processor_id()) {
+ console_verbose();
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&die.lock);
+ die.lock_owner = smp_processor_id();
+ die.lock_owner_depth = 0;
+ bust_spinlocks(1);
+ } else {
+ raw_local_irq_save(flags);
+ }
+
+ if (++die.lock_owner_depth < 3) {
+ report_bug(regs->ip, regs);
+
+ if (__die(str, regs, err))
+ regs = NULL;
+ } else {
+ printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
+ }
+
+ bust_spinlocks(0);
+ die.lock_owner = -1;
+ add_taint(TAINT_DIE);
+ __raw_spin_unlock(&die.lock);
+ raw_local_irq_restore(flags);
+
+ if (!regs)
+ return;
+
+ if (kexec_should_crash(current))
+ crash_kexec(regs);
+
+ if (in_interrupt())
+ panic("Fatal exception in interrupt");
+
+ if (panic_on_oops)
+ panic("Fatal exception");
+
+ oops_exit();
+ do_exit(SIGSEGV);
+}
+
+static inline void
+die_if_kernel(const char *str, struct pt_regs *regs, long err)
+{
+ if (!user_mode_vm(regs))
+ die(str, regs, err);
+}
+
+static void __kprobes
+do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs,
+ long error_code, siginfo_t *info)
+{
+ struct task_struct *tsk = current;
+
+ if (regs->flags & X86_VM_MASK) {
+ if (vm86)
+ goto vm86_trap;
+ goto trap_signal;
+ }
+
+ if (!user_mode(regs))
+ goto kernel_trap;
+
+trap_signal:
+ /*
+ * We want error_code and trap_no set for userspace faults and
+ * kernelspace faults which result in die(), but not
+ * kernelspace faults which are fixed up. die() gives the
+ * process no chance to handle the signal and notice the
+ * kernel fault information, so that won't result in polluting
+ * the information about previously queued, but not yet
+ * delivered, faults. See also do_general_protection below.
+ */
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
+
+ if (info)
+ force_sig_info(signr, info, tsk);
+ else
+ force_sig(signr, tsk);
+ return;
+
+kernel_trap:
+ if (!fixup_exception(regs)) {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
+ die(str, regs, error_code);
+ }
+ return;
+
+vm86_trap:
+ if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
+ error_code, trapnr))
+ goto trap_signal;
+ return;
+}
+
+#define DO_ERROR(trapnr, signr, str, name) \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ trace_hardirqs_fixup(); \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
+}
+
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ siginfo_t info; \
+ if (irq) \
+ local_irq_enable(); \
+ info.si_signo = signr; \
+ info.si_errno = 0; \
+ info.si_code = sicode; \
+ info.si_addr = (void __user *)siaddr; \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
+}
+
+#define DO_VM86_ERROR(trapnr, signr, str, name) \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
+}
+
+#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ siginfo_t info; \
+ info.si_signo = signr; \
+ info.si_errno = 0; \
+ info.si_code = sicode; \
+ info.si_addr = (void __user *)siaddr; \
+ trace_hardirqs_fixup(); \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
+}
+
+DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+#ifndef CONFIG_KPROBES
+DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
+#endif
+DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
+DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
+DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
+
+void __kprobes do_general_protection(struct pt_regs * regs,
+ long error_code)
+{
+ struct thread_struct *thread;
+
+ thread = ¤t->thread;
+
+ if (regs->flags & X86_VM_MASK)
+ goto gp_in_vm86;
+
+ if (!user_mode(regs))
+ goto gp_in_kernel;
+
+ current->thread.error_code = error_code;
+ current->thread.trap_no = 13;
+
+ if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
+ printk_ratelimit()) {
+ printk(KERN_INFO
+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
+ current->comm, task_pid_nr(current),
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ printk("\n");
+ }
+
+ force_sig(SIGSEGV, current);
+ return;
+
+gp_in_vm86:
+ local_irq_enable();
+ handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+ return;
+
+gp_in_kernel:
+ if (!fixup_exception(regs)) {
+ current->thread.error_code = error_code;
+ current->thread.trap_no = 13;
+ if (notify_die(DIE_GPF, "general protection fault", regs,
+ error_code, 13, SIGSEGV) == NOTIFY_STOP)
+ return;
+ die("general protection fault", regs, error_code);
+ }
+}
+
+static notrace __kprobes void
+mem_parity_error(unsigned char reason, struct pt_regs *regs)
+{
+ printk(KERN_EMERG
+ "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
+ printk(KERN_EMERG
+ "You have some hardware problem, likely on the PCI bus.\n");
+
+#if defined(CONFIG_EDAC)
+ if (edac_handler_set()) {
+ edac_atomic_assert_error();
+ return;
+ }
+#endif
+
+ if (panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+
+ /* Clear and disable the memory parity error line. */
+ clear_mem_error(reason);
+}
+
+static notrace __kprobes void
+io_check_error(unsigned char reason, struct pt_regs *regs)
+{
+ printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
+ show_registers(regs);
+
+ /* Re-enable the IOCK line, wait for a few seconds */
+ clear_io_check_error(reason);
+}
+
+static notrace __kprobes void
+unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
+{
+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ return;
+#ifdef CONFIG_MCA
+ /*
+ * Might actually be able to figure out what the guilty party
+ * is:
+ */
+ if (MCA_bus) {
+ mca_handle_nmi();
+ return;
+ }
+#endif
+ printk(KERN_EMERG
+ "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
+ printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+ if (panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+}
+
+static DEFINE_SPINLOCK(nmi_print_lock);
+
+void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
+{
+ if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+ return;
+
+ spin_lock(&nmi_print_lock);
+ /*
+ * We are in trouble anyway, lets at least try
+ * to get a message out:
+ */
+ bust_spinlocks(1);
+ printk(KERN_EMERG "%s", msg);
+ printk(" on CPU%d, ip %08lx, registers:\n",
+ smp_processor_id(), regs->ip);
+ show_registers(regs);
+ console_silent();
+ spin_unlock(&nmi_print_lock);
+ bust_spinlocks(0);
+
+ /*
+ * If we are in kernel we are probably nested up pretty bad
+ * and might aswell get out now while we still can:
+ */
+ if (!user_mode_vm(regs)) {
+ current->thread.trap_no = 2;
+ crash_kexec(regs);
+ }
+
+ do_exit(SIGSEGV);
+}
+
+static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+{
+ unsigned char reason = 0;
+
+ /* Only the BSP gets external NMIs from the system: */
+ if (!smp_processor_id())
+ reason = get_nmi_reason();
+
+ if (!(reason & 0xc0)) {
+ if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
+ == NOTIFY_STOP)
+ return;
+#ifdef CONFIG_X86_LOCAL_APIC
+#ifndef CONFIG_XEN
+ /*
+ * Ok, so this is none of the documented NMI sources,
+ * so it must be the NMI watchdog.
+ */
+ if (nmi_watchdog_tick(regs, reason))
+ return;
+#endif
+ if (!do_nmi_callback(regs, smp_processor_id()))
+ unknown_nmi_error(reason, regs);
+#else
+ unknown_nmi_error(reason, regs);
+#endif
+
+ return;
+ }
+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ return;
+ if (reason & 0x80)
+ mem_parity_error(reason, regs);
+ if (reason & 0x40)
+ io_check_error(reason, regs);
+ /*
+ * Reassert NMI in case it became active meanwhile
+ * as it's edge-triggered:
+ */
+ reassert_nmi();
+}
+
+static int ignore_nmis;
+
+notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
+{
+ int cpu;
+
+ nmi_enter();
+
+ cpu = smp_processor_id();
+
+ ++nmi_count(cpu);
+
+ if (!ignore_nmis)
+ default_do_nmi(regs);
+
+ nmi_exit();
+}
+
+void stop_nmi(void)
+{
+ acpi_nmi_disable();
+ ignore_nmis++;
+}
+
+void restart_nmi(void)
+{
+ ignore_nmis--;
+ acpi_nmi_enable();
+}
+
+#ifdef CONFIG_KPROBES
+void __kprobes do_int3(struct pt_regs *regs, long error_code)
+{
+ trace_hardirqs_fixup();
+
+ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
+ == NOTIFY_STOP)
+ return;
+ /*
+ * This is an interrupt gate, because kprobes wants interrupts
+ * disabled. Normal trap handlers don't.
+ */
+ restore_interrupts(regs);
+
+ do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
+}
+#endif
+
+/*
+ * Our handling of the processor debug registers is non-trivial.
+ * We do not clear them on entry and exit from the kernel. Therefore
+ * it is possible to get a watchpoint trap here from inside the kernel.
+ * However, the code in ./ptrace.c has ensured that the user can
+ * only set watchpoints on userspace addresses. Therefore the in-kernel
+ * watchpoint trap can only occur in code which is reading/writing
+ * from user space. Such code must not hold kernel locks (since it
+ * can equally take a page fault), therefore it is safe to call
+ * force_sig_info even though that claims and releases locks.
+ *
+ * Code in ./signal.c ensures that the debug control register
+ * is restored before we deliver any signal, and therefore that
+ * user code runs with the correct debug control register even though
+ * we clear it here.
+ *
+ * Being careful here means that we don't have to be as careful in a
+ * lot of more complicated places (task switching can be a bit lazy
+ * about restoring all the debug state, and ptrace doesn't have to
+ * find every occurrence of the TF bit that could be saved away even
+ * by user code)
+ */
+void __kprobes do_debug(struct pt_regs *regs, long error_code)
+{
+ struct task_struct *tsk = current;
+ unsigned int condition;
+
+ trace_hardirqs_fixup();
+
+ get_debugreg(condition, 6);
+
+ /*
+ * The processor cleared BTF, so don't mark that we need it set.
+ */
+ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
+ tsk->thread.debugctlmsr = 0;
+
+ if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
+ SIGTRAP) == NOTIFY_STOP)
+ return;
+ /* It's safe to allow irq's after DR6 has been saved */
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+
+ /* Mask out spurious debug traps due to lazy DR7 setting */
+ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
+ if (!tsk->thread.debugreg7)
+ goto clear_dr7;
+ }
+
+ if (regs->flags & X86_VM_MASK)
+ goto debug_vm86;
+
+ /* Save debug status register where ptrace can see it */
+ tsk->thread.debugreg6 = condition;
+
+ /*
+ * Single-stepping through TF: make sure we ignore any events in
+ * kernel space (but re-enable TF when returning to user mode).
+ */
+ if (condition & DR_STEP) {
+ /*
+ * We already checked v86 mode above, so we can
+ * check for kernel mode by just checking the CPL
+ * of CS.
+ */
+ if (!user_mode(regs))
+ goto clear_TF_reenable;
+ }
+
+ /* Ok, finally something we can handle */
+ send_sigtrap(tsk, regs, error_code);
+
+ /*
+ * Disable additional traps. They'll be re-enabled when
+ * the signal is delivered.
+ */
+clear_dr7:
+ set_debugreg(0, 7);
+ return;
+
+debug_vm86:
+ handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
+ return;
+
+clear_TF_reenable:
+ set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+ regs->flags &= ~X86_EFLAGS_TF;
+ return;
+}
+
+/*
+ * Note that we play around with the 'TS' bit in an attempt to get
+ * the correct behaviour even in the presence of the asynchronous
+ * IRQ13 behaviour
+ */
+void math_error(void __user *ip)
+{
+ struct task_struct *task;
+ unsigned short cwd;
+ unsigned short swd;
+ siginfo_t info;
+
+ /*
+ * Save the info for the exception handler and clear the error.
+ */
+ task = current;
+ save_init_fpu(task);
+ task->thread.trap_no = 16;
+ task->thread.error_code = 0;
+ info.si_signo = SIGFPE;
+ info.si_errno = 0;
+ info.si_code = __SI_FAULT;
+ info.si_addr = ip;
+ /*
+ * (~cwd & swd) will mask out exceptions that are not set to unmasked
+ * status. 0x3f is the exception bits in these regs, 0x200 is the
+ * C1 reg you need in case of a stack fault, 0x040 is the stack
+ * fault bit. We should only be taking one exception at a time,
+ * so if this combination doesn't produce any single exception,
+ * then we have a bad program that isn't syncronizing its FPU usage
+ * and it will suffer the consequences since we won't be able to
+ * fully reproduce the context of the exception
+ */
+ cwd = get_fpu_cwd(task);
+ swd = get_fpu_swd(task);
+ switch (swd & ~cwd & 0x3f) {
+ case 0x000: /* No unmasked exception */
+ return;
+ default: /* Multiple exceptions */
+ break;
+ case 0x001: /* Invalid Op */
+ /*
+ * swd & 0x240 == 0x040: Stack Underflow
+ * swd & 0x240 == 0x240: Stack Overflow
+ * User must clear the SF bit (0x40) if set
+ */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
+ }
+ force_sig_info(SIGFPE, &info, task);
+}
+
+void do_coprocessor_error(struct pt_regs *regs, long error_code)
+{
+ ignore_fpu_irq = 1;
+ math_error((void __user *)regs->ip);
+}
+
+static void simd_math_error(void __user *ip)
+{
+ struct task_struct *task;
+ unsigned short mxcsr;
+ siginfo_t info;
+
+ /*
+ * Save the info for the exception handler and clear the error.
+ */
+ task = current;
+ save_init_fpu(task);
+ task->thread.trap_no = 19;
+ task->thread.error_code = 0;
+ info.si_signo = SIGFPE;
+ info.si_errno = 0;
+ info.si_code = __SI_FAULT;
+ info.si_addr = ip;
+ /*
+ * The SIMD FPU exceptions are handled a little differently, as there
+ * is only a single status/control register. Thus, to determine which
+ * unmasked exception was caught we must mask the exception mask bits
+ * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+ */
+ mxcsr = get_fpu_mxcsr(task);
+ switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
+ case 0x000:
+ default:
+ break;
+ case 0x001: /* Invalid Op */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
+ }
+ force_sig_info(SIGFPE, &info, task);
+}
+
+void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
+{
+ if (cpu_has_xmm) {
+ /* Handle SIMD FPU exceptions on PIII+ processors. */
+ ignore_fpu_irq = 1;
+ simd_math_error((void __user *)regs->ip);
+ return;
+ }
+ /*
+ * Handle strange cache flush from user space exception
+ * in all other cases. This is undocumented behaviour.
+ */
+ if (regs->flags & X86_VM_MASK) {
+ handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
+ return;
+ }
+ current->thread.trap_no = 19;
+ current->thread.error_code = error_code;
+ die_if_kernel("cache flush denied", regs, error_code);
+ force_sig(SIGSEGV, current);
+}
+
+#ifndef CONFIG_XEN
+void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
+{
+#if 0
+ /* No need to warn about this any longer. */
+ printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
+#endif
+}
+
+unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
+{
+ struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
+ unsigned long base = (kesp - uesp) & -THREAD_SIZE;
+ unsigned long new_kesp = kesp - base;
+ unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
+ __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
+
+ /* Set up base for espfix segment */
+ desc &= 0x00f0ff0000000000ULL;
+ desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
+ ((((__u64)base) << 32) & 0xff00000000000000ULL) |
+ ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
+ (lim_pages & 0xffff);
+ *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
+
+ return new_kesp;
+}
+#endif
+
+/*
+ * 'math_state_restore()' saves the current math information in the
+ * old math state array, and gets the new ones from the current task
+ *
+ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
+ * Don't touch unless you *really* know how it works.
+ *
+ * Must be called with kernel preemption disabled (in this case,
+ * local interrupts are disabled at the call-site in entry.S).
+ */
+asmlinkage void math_state_restore(void)
+{
+ struct thread_info *thread = current_thread_info();
+ struct task_struct *tsk = thread->task;
+
+ if (!tsk_used_math(tsk)) {
+ local_irq_enable();
+ /*
+ * does a slab alloc which can sleep
+ */
+ if (init_fpu(tsk)) {
+ /*
+ * ran out of memory!
+ */
+ do_group_exit(SIGKILL);
+ return;
+ }
+ local_irq_disable();
+ }
+
+ /* NB. 'clts' is done for us by Xen during virtual trap. */
+ restore_fpu(tsk);
+ thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
+ tsk->fpu_counter++;
+}
+EXPORT_SYMBOL_GPL(math_state_restore);
+
+#ifndef CONFIG_MATH_EMULATION
+
+asmlinkage void math_emulate(long arg)
+{
+ printk(KERN_EMERG
+ "math-emulation not enabled and no coprocessor found.\n");
+ printk(KERN_EMERG "killing %s.\n", current->comm);
+ force_sig(SIGFPE, current);
+ schedule();
+}
+
+#endif /* CONFIG_MATH_EMULATION */
+
+/*
+ * NB. All these are "trap gates" (i.e. events_mask isn't set) except
+ * for those that specify <dpl>|4 in the second field.
+ */
+static const trap_info_t __cpuinitconst trap_table[] = {
+ { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
+ { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
+ { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
+ { 4, 3, __KERNEL_CS, (unsigned long)overflow },
+ { 5, 0, __KERNEL_CS, (unsigned long)bounds },
+ { 6, 0, __KERNEL_CS, (unsigned long)invalid_op },
+ { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },
+ { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
+ { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS },
+ { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present },
+ { 12, 0, __KERNEL_CS, (unsigned long)stack_segment },
+ { 13, 0, __KERNEL_CS, (unsigned long)general_protection },
+ { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault },
+ { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment },
+ { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error },
+ { 17, 0, __KERNEL_CS, (unsigned long)alignment_check },
+#ifdef CONFIG_X86_MCE
+ { 18, 0, __KERNEL_CS, (unsigned long)machine_check },
+#endif
+ { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
+ { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call },
+ { 0, 0, 0, 0 }
+};
+
+void __init trap_init(void)
+{
+ int ret;
+
+ ret = HYPERVISOR_set_trap_table(trap_table);
+ if (ret)
+ printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
+
+ if (cpu_has_fxsr) {
+ printk(KERN_INFO "Enabling fast FPU save and restore... ");
+ set_in_cr4(X86_CR4_OSFXSR);
+ printk("done.\n");
+ }
+ if (cpu_has_xmm) {
+ printk(KERN_INFO
+ "Enabling unmasked SIMD FPU exception support... ");
+ set_in_cr4(X86_CR4_OSXMMEXCPT);
+ printk("done.\n");
+ }
+
+ init_thread_xstate();
+ /*
+ * Should be a barrier for any external CPU state:
+ */
+ cpu_init();
+}
+
+void __cpuinit smp_trap_init(trap_info_t *trap_ctxt)
+{
+ const trap_info_t *t = trap_table;
+
+ for (t = trap_table; t->address; t++) {
+ trap_ctxt[t->vector].flags = t->flags;
+ trap_ctxt[t->vector].cs = t->cs;
+ trap_ctxt[t->vector].address = t->address;
+ }
+}
+
+static int __init oops_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "panic"))
+ panic_on_oops = 1;
+ return 0;
+}
+early_param("oops", oops_setup);
+
+static int __init kstack_setup(char *s)
+{
+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+
+ return 1;
+}
+__setup("kstack=", kstack_setup);
+
+static int __init code_bytes_setup(char *s)
+{
+ code_bytes = simple_strtoul(s, NULL, 0);
+ if (code_bytes > 8192)
+ code_bytes = 8192;
+
+ return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
+
+#ifdef CONFIG_STACK_UNWIND
+static int __init call_trace_setup(char *s)
+{
+ if (strcmp(s, "old") == 0)
+ call_trace = -1;
+ else if (strcmp(s, "both") == 0)
+ call_trace = 0;
+ else if (strcmp(s, "newfallback") == 0)
+ call_trace = 1;
+ else if (strcmp(s, "new") == 2)
+ call_trace = 2;
+ return 1;
+}
+__setup("call_trace=", call_trace_setup);
+#endif
--- /dev/null
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * 'Traps.c' handles hardware traps and faults after we have saved some
+ * state in 'entry.S'.
+ */
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/nmi.h>
+#include <linux/kprobes.h>
+#include <linux/kexec.h>
+#include <linux/unwind.h>
+#include <linux/uaccess.h>
+#include <linux/bug.h>
+#include <linux/kdebug.h>
+#include <linux/utsname.h>
+
+#include <mach_traps.h>
+
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/atomic.h>
+#include <asm/debugreg.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/processor.h>
+#include <asm/unwind.h>
+#include <asm/smp.h>
+#include <asm/pgalloc.h>
+#include <asm/pda.h>
+#include <asm/proto.h>
+#include <asm/nmi.h>
+#include <asm/stacktrace.h>
+
+asmlinkage void divide_error(void);
+asmlinkage void debug(void);
+asmlinkage void nmi(void);
+asmlinkage void int3(void);
+asmlinkage void overflow(void);
+asmlinkage void bounds(void);
+asmlinkage void invalid_op(void);
+asmlinkage void device_not_available(void);
+asmlinkage void double_fault(void);
+asmlinkage void coprocessor_segment_overrun(void);
+asmlinkage void invalid_TSS(void);
+asmlinkage void segment_not_present(void);
+asmlinkage void stack_segment(void);
+asmlinkage void general_protection(void);
+asmlinkage void page_fault(void);
+asmlinkage void coprocessor_error(void);
+asmlinkage void simd_coprocessor_error(void);
+asmlinkage void reserved(void);
+asmlinkage void alignment_check(void);
+asmlinkage void machine_check(void);
+asmlinkage void spurious_interrupt_bug(void);
+
+static unsigned int code_bytes = 64;
+
+static inline void conditional_sti(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+}
+
+static inline void preempt_conditional_sti(struct pt_regs *regs)
+{
+ inc_preempt_count();
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+}
+
+static inline void preempt_conditional_cli(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_disable();
+ /* Make sure to not schedule here because we could be running
+ on an exception stack. */
+ dec_preempt_count();
+}
+
+int kstack_depth_to_print = 12;
+#ifdef CONFIG_STACK_UNWIND
+static int call_trace = 1;
+#else
+#define call_trace (-1)
+#endif
+
+void printk_address(unsigned long address, int reliable)
+{
+#ifdef CONFIG_KALLSYMS
+ unsigned long offset = 0, symsize;
+ const char *symname;
+ char *modname;
+ char *delim = ":";
+ char namebuf[KSYM_NAME_LEN];
+ char reliab[4] = "";
+
+ symname = kallsyms_lookup(address, &symsize, &offset,
+ &modname, namebuf);
+ if (!symname) {
+ printk(" [<%016lx>]\n", address);
+ return;
+ }
+ if (!reliable)
+ strcpy(reliab, "? ");
+
+ if (!modname)
+ modname = delim = "";
+ printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
+ address, reliab, delim, modname, delim, symname, offset, symsize);
+#else
+ printk(" [<%016lx>]\n", address);
+#endif
+}
+
+static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
+ unsigned *usedp, char **idp)
+{
+#ifndef CONFIG_X86_NO_TSS
+ static char ids[][8] = {
+ [DEBUG_STACK - 1] = "#DB",
+ [NMI_STACK - 1] = "NMI",
+ [DOUBLEFAULT_STACK - 1] = "#DF",
+ [STACKFAULT_STACK - 1] = "#SS",
+ [MCE_STACK - 1] = "#MC",
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+ [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
+#endif
+ };
+ unsigned k;
+
+ /*
+ * Iterate over all exception stacks, and figure out whether
+ * 'stack' is in one of them:
+ */
+ for (k = 0; k < N_EXCEPTION_STACKS; k++) {
+ unsigned long end = per_cpu(orig_ist, cpu).ist[k];
+ /*
+ * Is 'stack' above this exception frame's end?
+ * If yes then skip to the next frame.
+ */
+ if (stack >= end)
+ continue;
+ /*
+ * Is 'stack' above this exception frame's start address?
+ * If yes then we found the right frame.
+ */
+ if (stack >= end - EXCEPTION_STKSZ) {
+ /*
+ * Make sure we only iterate through an exception
+ * stack once. If it comes up for the second time
+ * then there's something wrong going on - just
+ * break out and return NULL:
+ */
+ if (*usedp & (1U << k))
+ break;
+ *usedp |= 1U << k;
+ *idp = ids[k];
+ return (unsigned long *)end;
+ }
+ /*
+ * If this is a debug stack, and if it has a larger size than
+ * the usual exception stacks, then 'stack' might still
+ * be within the lower portion of the debug stack:
+ */
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+ if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
+ unsigned j = N_EXCEPTION_STACKS - 1;
+
+ /*
+ * Black magic. A large debug stack is composed of
+ * multiple exception stack entries, which we
+ * iterate through now. Dont look:
+ */
+ do {
+ ++j;
+ end -= EXCEPTION_STKSZ;
+ ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
+ } while (stack < end - EXCEPTION_STKSZ);
+ if (*usedp & (1U << j))
+ break;
+ *usedp |= 1U << j;
+ *idp = ids[j];
+ return (unsigned long *)end;
+ }
+#endif
+ }
+#endif
+ return NULL;
+}
+
+struct ops_and_data {
+ const struct stacktrace_ops *ops;
+ void *data;
+};
+
+static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
+{
+ struct ops_and_data *oad = (struct ops_and_data *)context;
+ int n = 0;
+ unsigned long sp = UNW_SP(info);
+
+ if (arch_unw_user_mode(info))
+ return -1;
+ while (unwind(info) == 0 && UNW_PC(info)) {
+ n++;
+ oad->ops->address(oad->data, UNW_PC(info), 1);
+ if (arch_unw_user_mode(info))
+ break;
+ if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1))
+ && sp > UNW_SP(info))
+ break;
+ sp = UNW_SP(info);
+ }
+ return n;
+}
+
+/*
+ * x86-64 can have up to three kernel stacks:
+ * process stack
+ * interrupt stack
+ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
+ */
+
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+ void *p, unsigned int size, void *end)
+{
+ void *t = tinfo;
+ if (end) {
+ if (p < end && p >= (end-THREAD_SIZE))
+ return 1;
+ else
+ return 0;
+ }
+ return p > t && p < t + THREAD_SIZE - size;
+}
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+ struct stack_frame *next_frame;
+ unsigned long return_address;
+};
+
+
+static inline unsigned long print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end)
+{
+ struct stack_frame *frame = (struct stack_frame *)bp;
+
+ while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
+ unsigned long addr;
+
+ addr = *stack;
+ if (__kernel_text_address(addr)) {
+ if ((unsigned long) stack == bp + 8) {
+ ops->address(data, addr, 1);
+ frame = frame->next_frame;
+ bp = (unsigned long) frame;
+ } else {
+ ops->address(data, addr, bp == 0);
+ }
+ }
+ stack++;
+ }
+ return bp;
+}
+
+void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data)
+{
+ const unsigned cpu = get_cpu();
+ unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
+ unsigned used = 0;
+ struct thread_info *tinfo;
+
+ if (!tsk)
+ tsk = current;
+ tinfo = task_thread_info(tsk);
+
+ if (call_trace >= 0) {
+ int unw_ret = 0;
+ struct unwind_frame_info info;
+ struct ops_and_data oad = { .ops = ops, .data = data };
+
+ if (regs) {
+ if (unwind_init_frame_info(&info, tsk, regs) == 0)
+ unw_ret = dump_trace_unwind(&info, &oad);
+ } else if (tsk == current)
+ unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
+ else {
+ if (unwind_init_blocked(&info, tsk) == 0)
+ unw_ret = dump_trace_unwind(&info, &oad);
+ }
+ if (unw_ret > 0) {
+ if (call_trace == 1 && !arch_unw_user_mode(&info)) {
+ ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
+ UNW_PC(&info));
+ if ((long)UNW_SP(&info) < 0) {
+ ops->warning(data, "Leftover inexact backtrace:\n");
+ stack = (unsigned long *)UNW_SP(&info);
+ if (!stack)
+ return;
+ } else
+ ops->warning(data, "Full inexact backtrace again:\n");
+ } else if (call_trace >= 1)
+ return;
+ else
+ ops->warning(data, "Full inexact backtrace again:\n");
+ } else
+ ops->warning(data, "Inexact backtrace:\n");
+ }
+
+ if (!stack) {
+ unsigned long dummy;
+ stack = &dummy;
+ if (tsk && tsk != current)
+ stack = (unsigned long *)tsk->thread.sp;
+ }
+
+#ifdef CONFIG_FRAME_POINTER
+ if (!bp) {
+ if (tsk == current) {
+ /* Grab bp right from our regs */
+ asm("movq %%rbp, %0" : "=r" (bp):);
+ } else {
+ /* bp is the last reg pushed by switch_to */
+ bp = *(unsigned long *) tsk->thread.sp;
+ }
+ }
+#endif
+
+
+
+ /*
+ * Print function call entries in all stacks, starting at the
+ * current stack address. If the stacks consist of nested
+ * exceptions
+ */
+ for (;;) {
+ char *id;
+ unsigned long *estack_end;
+ estack_end = in_exception_stack(cpu, (unsigned long)stack,
+ &used, &id);
+
+ if (estack_end) {
+ if (ops->stack(data, id) < 0)
+ break;
+
+ bp = print_context_stack(tinfo, stack, bp, ops,
+ data, estack_end);
+ ops->stack(data, "<EOE>");
+ /*
+ * We link to the next stack via the
+ * second-to-last pointer (index -2 to end) in the
+ * exception stack:
+ */
+ stack = (unsigned long *) estack_end[-2];
+ continue;
+ }
+ if (irqstack_end) {
+ unsigned long *irqstack;
+ irqstack = irqstack_end -
+ (IRQSTACKSIZE - 64) / sizeof(*irqstack);
+
+ if (stack >= irqstack && stack < irqstack_end) {
+ if (ops->stack(data, "IRQ") < 0)
+ break;
+ bp = print_context_stack(tinfo, stack, bp,
+ ops, data, irqstack_end);
+ /*
+ * We link to the next stack (which would be
+ * the process stack normally) the last
+ * pointer (index -1 to end) in the IRQ stack:
+ */
+ stack = (unsigned long *) (irqstack_end[-1]);
+ irqstack_end = NULL;
+ ops->stack(data, "EOI");
+ continue;
+ }
+ }
+ break;
+ }
+
+ /*
+ * This handles the process stack:
+ */
+ bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
+ put_cpu();
+}
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ print_symbol(msg, symbol);
+ printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+ printk("%s\n", msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+ printk(" <%s> ", name);
+ return 0;
+}
+
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+ touch_nmi_watchdog();
+ printk_address(addr, reliable);
+}
+
+static const struct stacktrace_ops print_trace_ops = {
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
+};
+
+void
+show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
+ unsigned long bp)
+{
+ printk("\nCall Trace:\n");
+ dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
+ printk("\n");
+}
+
+static void
+_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
+ unsigned long bp)
+{
+ unsigned long *stack;
+ int i;
+ const int cpu = smp_processor_id();
+ unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
+ unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+
+ // debugging aid: "show_stack(NULL, NULL);" prints the
+ // back trace for this cpu.
+
+ if (sp == NULL) {
+ if (tsk)
+ sp = (unsigned long *)tsk->thread.sp;
+ else
+ sp = (unsigned long *)&sp;
+ }
+
+ stack = sp;
+ for(i=0; i < kstack_depth_to_print; i++) {
+ if (stack >= irqstack && stack <= irqstack_end) {
+ if (stack == irqstack_end) {
+ stack = (unsigned long *) (irqstack_end[-1]);
+ printk(" <EOI> ");
+ }
+ } else {
+ if (((long) stack & (THREAD_SIZE-1)) == 0)
+ break;
+ }
+ if (i && ((i % 4) == 0))
+ printk("\n");
+ printk(" %016lx", *stack++);
+ touch_nmi_watchdog();
+ }
+ show_trace(tsk, regs, sp, bp);
+}
+
+void show_stack(struct task_struct *tsk, unsigned long * sp)
+{
+ _show_stack(tsk, NULL, sp, 0);
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+ unsigned long dummy;
+ unsigned long bp = 0;
+
+#ifdef CONFIG_FRAME_POINTER
+ if (!bp)
+ asm("movq %%rbp, %0" : "=r" (bp):);
+#endif
+
+ printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ current->pid, current->comm, print_tainted(),
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ show_trace(NULL, NULL, &dummy, bp);
+}
+
+EXPORT_SYMBOL(dump_stack);
+
+void show_registers(struct pt_regs *regs)
+{
+ int i;
+ unsigned long sp;
+ const int cpu = smp_processor_id();
+ struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+ u8 *ip;
+ unsigned int code_prologue = code_bytes * 43 / 64;
+ unsigned int code_len = code_bytes;
+
+ sp = regs->sp;
+ ip = (u8 *) regs->ip - code_prologue;
+ printk("CPU %d ", cpu);
+ __show_regs(regs);
+ printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
+ cur->comm, cur->pid, task_thread_info(cur), cur);
+
+ /*
+ * When in-kernel, we also print out the stack and code at the
+ * time of the fault..
+ */
+ if (!user_mode(regs)) {
+ unsigned char c;
+ printk("Stack: ");
+ _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
+ printk("\n");
+
+ printk(KERN_EMERG "Code: ");
+ if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
+ /* try starting at RIP */
+ ip = (u8 *) regs->ip;
+ code_len = code_len - code_prologue + 1;
+ }
+ for (i = 0; i < code_len; i++, ip++) {
+ if (ip < (u8 *)PAGE_OFFSET ||
+ probe_kernel_address(ip, c)) {
+ printk(" Bad RIP value.");
+ break;
+ }
+ if (ip == (u8 *)regs->ip)
+ printk("<%02x> ", c);
+ else
+ printk("%02x ", c);
+ }
+ }
+ printk("\n");
+}
+
+int is_valid_bugaddr(unsigned long ip)
+{
+ unsigned short ud2;
+
+ if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
+ return 0;
+
+ return ud2 == 0x0b0f;
+}
+
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+ int cpu;
+ unsigned long flags;
+
+ oops_enter();
+
+ /* racy, but better than risking deadlock. */
+ raw_local_irq_save(flags);
+ cpu = smp_processor_id();
+ if (!__raw_spin_trylock(&die_lock)) {
+ if (cpu == die_owner)
+ /* nested oops. should stop eventually */;
+ else
+ __raw_spin_lock(&die_lock);
+ }
+ die_nest_count++;
+ die_owner = cpu;
+ console_verbose();
+ bust_spinlocks(1);
+ return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+ die_owner = -1;
+ bust_spinlocks(0);
+ die_nest_count--;
+ if (!die_nest_count)
+ /* Nest count reaches zero, release the lock. */
+ __raw_spin_unlock(&die_lock);
+ raw_local_irq_restore(flags);
+ if (!regs) {
+ oops_exit();
+ return;
+ }
+ if (panic_on_oops)
+ panic("Fatal exception");
+ oops_exit();
+ do_exit(signr);
+}
+
+int __kprobes __die(const char * str, struct pt_regs * regs, long err)
+{
+ static int die_counter;
+ printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
+#ifdef CONFIG_PREEMPT
+ printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+ printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk("DEBUG_PAGEALLOC");
+#endif
+ printk("\n");
+ sysfs_printk_last_file();
+ if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ return 1;
+ show_registers(regs);
+ add_taint(TAINT_DIE);
+ /* Executive summary in case the oops scrolled away */
+ printk(KERN_ALERT "RIP ");
+ printk_address(regs->ip, 1);
+ printk(" RSP <%016lx>\n", regs->sp);
+ if (kexec_should_crash(current))
+ crash_kexec(regs);
+ return 0;
+}
+
+void die(const char * str, struct pt_regs * regs, long err)
+{
+ unsigned long flags = oops_begin();
+
+ if (!user_mode(regs))
+ report_bug(regs->ip, regs);
+
+ if (__die(str, regs, err))
+ regs = NULL;
+ oops_end(flags, regs, SIGSEGV);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+notrace __kprobes void
+die_nmi(char *str, struct pt_regs *regs, int do_panic)
+{
+ unsigned long flags;
+
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
+ NOTIFY_STOP)
+ return;
+
+ flags = oops_begin();
+ /*
+ * We are in trouble anyway, lets at least try
+ * to get a message out.
+ */
+ printk(str, smp_processor_id());
+ show_registers(regs);
+ if (kexec_should_crash(current))
+ crash_kexec(regs);
+ if (do_panic || panic_on_oops)
+ panic("Non maskable interrupt");
+ oops_end(flags, NULL, SIGBUS);
+ nmi_exit();
+ local_irq_enable();
+ do_exit(SIGBUS);
+}
+#endif
+
+static void __kprobes do_trap(int trapnr, int signr, char *str,
+ struct pt_regs * regs, long error_code,
+ siginfo_t *info)
+{
+ struct task_struct *tsk = current;
+
+ if (user_mode(regs)) {
+ /*
+ * We want error_code and trap_no set for userspace
+ * faults and kernelspace faults which result in
+ * die(), but not kernelspace faults which are fixed
+ * up. die() gives the process no chance to handle
+ * the signal and notice the kernel fault information,
+ * so that won't result in polluting the information
+ * about previously queued, but not yet delivered,
+ * faults. See also do_general_protection below.
+ */
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+ printk_ratelimit()) {
+ printk(KERN_INFO
+ "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid, str,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ printk("\n");
+ }
+
+ if (info)
+ force_sig_info(signr, info, tsk);
+ else
+ force_sig(signr, tsk);
+ return;
+ }
+
+
+ if (!fixup_exception(regs)) {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
+ die(str, regs, error_code);
+ }
+ return;
+}
+
+#define DO_ERROR(trapnr, signr, str, name) \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ conditional_sti(regs); \
+ do_trap(trapnr, signr, str, regs, error_code, NULL); \
+}
+
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+ siginfo_t info; \
+ info.si_signo = signr; \
+ info.si_errno = 0; \
+ info.si_code = sicode; \
+ info.si_addr = (void __user *)siaddr; \
+ trace_hardirqs_fixup(); \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ conditional_sti(regs); \
+ do_trap(trapnr, signr, str, regs, error_code, &info); \
+}
+
+DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+DO_ERROR( 4, SIGSEGV, "overflow", overflow)
+DO_ERROR( 5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
+DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
+DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+DO_ERROR(18, SIGSEGV, "reserved", reserved)
+
+/* Runs on IST stack */
+asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
+{
+ if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
+ 12, SIGBUS) == NOTIFY_STOP)
+ return;
+ preempt_conditional_sti(regs);
+ do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
+ preempt_conditional_cli(regs);
+}
+
+asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
+{
+ static const char str[] = "double fault";
+ struct task_struct *tsk = current;
+
+ /* Return not checked because double check cannot be ignored */
+ notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 8;
+
+ /* This is always a kernel trap and never fixable (and thus must
+ never return). */
+ for (;;)
+ die(str, regs, error_code);
+}
+
+asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
+ long error_code)
+{
+ struct task_struct *tsk = current;
+
+ conditional_sti(regs);
+
+ if (user_mode(regs)) {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 13;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+ printk(KERN_INFO
+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ printk("\n");
+ }
+
+ force_sig(SIGSEGV, tsk);
+ return;
+ }
+
+ if (fixup_exception(regs))
+ return;
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 13;
+ if (notify_die(DIE_GPF, "general protection fault", regs,
+ error_code, 13, SIGSEGV) == NOTIFY_STOP)
+ return;
+ die("general protection fault", regs, error_code);
+}
+
+static notrace __kprobes void
+mem_parity_error(unsigned char reason, struct pt_regs * regs)
+{
+ printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
+ reason);
+ printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
+
+#if defined(CONFIG_EDAC)
+ if(edac_handler_set()) {
+ edac_atomic_assert_error();
+ return;
+ }
+#endif
+
+ if (panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+
+ /* Clear and disable the memory parity error line. */
+ clear_mem_error(reason);
+}
+
+static notrace __kprobes void
+io_check_error(unsigned char reason, struct pt_regs * regs)
+{
+ printk("NMI: IOCK error (debug interrupt?)\n");
+ show_registers(regs);
+
+ /* Re-enable the IOCK line, wait for a few seconds */
+ clear_io_check_error(reason);
+}
+
+static notrace __kprobes void
+unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+{
+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ return;
+ printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
+ reason);
+ printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+
+ if (panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+}
+
+/* Runs on IST stack. This code must keep interrupts off all the time.
+ Nested NMIs are prevented by the CPU. */
+asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+{
+ unsigned char reason = 0;
+ int cpu;
+
+ cpu = smp_processor_id();
+
+ /* Only the BSP gets external NMIs from the system. */
+ if (!cpu)
+ reason = get_nmi_reason();
+
+ if (!(reason & 0xc0)) {
+ if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
+ == NOTIFY_STOP)
+ return;
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
+ /*
+ * Ok, so this is none of the documented NMI sources,
+ * so it must be the NMI watchdog.
+ */
+ if (nmi_watchdog_tick(regs,reason))
+ return;
+#endif
+ if (!do_nmi_callback(regs,cpu))
+ unknown_nmi_error(reason, regs);
+
+ return;
+ }
+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ return;
+
+ /* AK: following checks seem to be broken on modern chipsets. FIXME */
+
+ if (reason & 0x80)
+ mem_parity_error(reason, regs);
+ if (reason & 0x40)
+ io_check_error(reason, regs);
+}
+
+/* runs on IST stack. */
+asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
+{
+ trace_hardirqs_fixup();
+
+ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
+ return;
+ }
+ preempt_conditional_sti(regs);
+ do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
+ preempt_conditional_cli(regs);
+}
+
+/* Help handler running on IST stack to switch back to user stack
+ for scheduling or signal handling. The actual stack switch is done in
+ entry.S */
+asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
+{
+ struct pt_regs *regs = eregs;
+ /* Did already sync */
+ if (eregs == (struct pt_regs *)eregs->sp)
+ ;
+ /* Exception from user space */
+ else if (user_mode(eregs))
+ regs = task_pt_regs(current);
+ /* Exception from kernel and interrupts are enabled. Move to
+ kernel process stack. */
+ else if (eregs->flags & X86_EFLAGS_IF)
+ regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
+ if (eregs != regs)
+ *regs = *eregs;
+ return regs;
+}
+
+/* runs on IST stack. */
+asmlinkage void __kprobes do_debug(struct pt_regs * regs,
+ unsigned long error_code)
+{
+ unsigned long condition;
+ struct task_struct *tsk = current;
+ siginfo_t info;
+
+ trace_hardirqs_fixup();
+
+ get_debugreg(condition, 6);
+
+ /*
+ * The processor cleared BTF, so don't mark that we need it set.
+ */
+ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
+ tsk->thread.debugctlmsr = 0;
+
+ if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
+ SIGTRAP) == NOTIFY_STOP)
+ return;
+
+ preempt_conditional_sti(regs);
+
+ /* Mask out spurious debug traps due to lazy DR7 setting */
+ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
+ if (!tsk->thread.debugreg7) {
+ goto clear_dr7;
+ }
+ }
+
+ tsk->thread.debugreg6 = condition;
+
+
+ /*
+ * Single-stepping through TF: make sure we ignore any events in
+ * kernel space (but re-enable TF when returning to user mode).
+ */
+ if (condition & DR_STEP) {
+ if (!user_mode(regs))
+ goto clear_TF_reenable;
+ }
+
+ /* Ok, finally something we can handle */
+ tsk->thread.trap_no = 1;
+ tsk->thread.error_code = error_code;
+ info.si_signo = SIGTRAP;
+ info.si_errno = 0;
+ info.si_code = TRAP_BRKPT;
+ info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
+ force_sig_info(SIGTRAP, &info, tsk);
+
+clear_dr7:
+ set_debugreg(0UL, 7);
+ preempt_conditional_cli(regs);
+ return;
+
+clear_TF_reenable:
+ set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+ regs->flags &= ~X86_EFLAGS_TF;
+ preempt_conditional_cli(regs);
+}
+
+static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
+{
+ if (fixup_exception(regs))
+ return 1;
+
+ notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
+ /* Illegal floating point operation in the kernel */
+ current->thread.trap_no = trapnr;
+ die(str, regs, 0);
+ return 0;
+}
+
+/*
+ * Note that we play around with the 'TS' bit in an attempt to get
+ * the correct behaviour even in the presence of the asynchronous
+ * IRQ13 behaviour
+ */
+asmlinkage void do_coprocessor_error(struct pt_regs *regs)
+{
+ void __user *ip = (void __user *)(regs->ip);
+ struct task_struct * task;
+ siginfo_t info;
+ unsigned short cwd, swd;
+
+ conditional_sti(regs);
+ if (!user_mode(regs) &&
+ kernel_math_error(regs, "kernel x87 math error", 16))
+ return;
+
+ /*
+ * Save the info for the exception handler and clear the error.
+ */
+ task = current;
+ save_init_fpu(task);
+ task->thread.trap_no = 16;
+ task->thread.error_code = 0;
+ info.si_signo = SIGFPE;
+ info.si_errno = 0;
+ info.si_code = __SI_FAULT;
+ info.si_addr = ip;
+ /*
+ * (~cwd & swd) will mask out exceptions that are not set to unmasked
+ * status. 0x3f is the exception bits in these regs, 0x200 is the
+ * C1 reg you need in case of a stack fault, 0x040 is the stack
+ * fault bit. We should only be taking one exception at a time,
+ * so if this combination doesn't produce any single exception,
+ * then we have a bad program that isn't synchronizing its FPU usage
+ * and it will suffer the consequences since we won't be able to
+ * fully reproduce the context of the exception
+ */
+ cwd = get_fpu_cwd(task);
+ swd = get_fpu_swd(task);
+ switch (swd & ~cwd & 0x3f) {
+ case 0x000:
+ default:
+ break;
+ case 0x001: /* Invalid Op */
+ /*
+ * swd & 0x240 == 0x040: Stack Underflow
+ * swd & 0x240 == 0x240: Stack Overflow
+ * User must clear the SF bit (0x40) if set
+ */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
+ }
+ force_sig_info(SIGFPE, &info, task);
+}
+
+asmlinkage void bad_intr(void)
+{
+ printk("bad interrupt");
+}
+
+asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
+{
+ void __user *ip = (void __user *)(regs->ip);
+ struct task_struct * task;
+ siginfo_t info;
+ unsigned short mxcsr;
+
+ conditional_sti(regs);
+ if (!user_mode(regs) &&
+ kernel_math_error(regs, "kernel simd math error", 19))
+ return;
+
+ /*
+ * Save the info for the exception handler and clear the error.
+ */
+ task = current;
+ save_init_fpu(task);
+ task->thread.trap_no = 19;
+ task->thread.error_code = 0;
+ info.si_signo = SIGFPE;
+ info.si_errno = 0;
+ info.si_code = __SI_FAULT;
+ info.si_addr = ip;
+ /*
+ * The SIMD FPU exceptions are handled a little differently, as there
+ * is only a single status/control register. Thus, to determine which
+ * unmasked exception was caught we must mask the exception mask bits
+ * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+ */
+ mxcsr = get_fpu_mxcsr(task);
+ switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
+ case 0x000:
+ default:
+ break;
+ case 0x001: /* Invalid Op */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
+ }
+ force_sig_info(SIGFPE, &info, task);
+}
+
+asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
+{
+}
+
+#ifndef CONFIG_XEN
+asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
+{
+}
+
+asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
+{
+}
+#endif
+
+/*
+ * 'math_state_restore()' saves the current math information in the
+ * old math state array, and gets the new ones from the current task
+ *
+ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
+ * Don't touch unless you *really* know how it works.
+ */
+asmlinkage void math_state_restore(void)
+{
+ struct task_struct *me = current;
+
+ if (!used_math()) {
+ local_irq_enable();
+ /*
+ * does a slab alloc which can sleep
+ */
+ if (init_fpu(me)) {
+ /*
+ * ran out of memory!
+ */
+ do_group_exit(SIGKILL);
+ return;
+ }
+ local_irq_disable();
+ }
+
+ /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
+
+ restore_fpu_checking(&me->thread.xstate->fxsave);
+ task_thread_info(me)->status |= TS_USEDFPU;
+ me->fpu_counter++;
+}
+EXPORT_SYMBOL_GPL(math_state_restore);
+
+
+/*
+ * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
+ * specify <dpl>|4 in the second field.
+ */
+static const trap_info_t __cpuinitconst trap_table[] = {
+ { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error },
+ { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
+ { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
+ { 4, 3|4, __KERNEL_CS, (unsigned long)overflow },
+ { 5, 0|4, __KERNEL_CS, (unsigned long)bounds },
+ { 6, 0|4, __KERNEL_CS, (unsigned long)invalid_op },
+ { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },
+ { 9, 0|4, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun},
+ { 10, 0|4, __KERNEL_CS, (unsigned long)invalid_TSS },
+ { 11, 0|4, __KERNEL_CS, (unsigned long)segment_not_present },
+ { 12, 0|4, __KERNEL_CS, (unsigned long)stack_segment },
+ { 13, 0|4, __KERNEL_CS, (unsigned long)general_protection },
+ { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault },
+ { 15, 0|4, __KERNEL_CS, (unsigned long)spurious_interrupt_bug },
+ { 16, 0|4, __KERNEL_CS, (unsigned long)coprocessor_error },
+ { 17, 0|4, __KERNEL_CS, (unsigned long)alignment_check },
+#ifdef CONFIG_X86_MCE
+ { 18, 0|4, __KERNEL_CS, (unsigned long)machine_check },
+#endif
+ { 19, 0|4, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
+#ifdef CONFIG_IA32_EMULATION
+ { IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)ia32_syscall},
+#endif
+ { 0, 0, 0, 0 }
+};
+
+void __init trap_init(void)
+{
+ int ret;
+
+ ret = HYPERVISOR_set_trap_table(trap_table);
+ if (ret)
+ printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
+
+ /*
+ * initialize the per thread extended state:
+ */
+ init_thread_xstate();
+ /*
+ * Should be a barrier for any external CPU state.
+ */
+ cpu_init();
+}
+
+void __cpuinit smp_trap_init(trap_info_t *trap_ctxt)
+{
+ const trap_info_t *t = trap_table;
+
+ for (t = trap_table; t->address; t++) {
+ trap_ctxt[t->vector].flags = t->flags;
+ trap_ctxt[t->vector].cs = t->cs;
+ trap_ctxt[t->vector].address = t->address;
+ }
+}
+
+
+static int __init oops_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "panic"))
+ panic_on_oops = 1;
+ return 0;
+}
+early_param("oops", oops_setup);
+
+static int __init kstack_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ kstack_depth_to_print = simple_strtoul(s,NULL,0);
+ return 0;
+}
+early_param("kstack", kstack_setup);
+
+
+static int __init code_bytes_setup(char *s)
+{
+ code_bytes = simple_strtoul(s, NULL, 0);
+ if (code_bytes > 8192)
+ code_bytes = 8192;
+
+ return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
+
+#ifdef CONFIG_STACK_UNWIND
+static int __init call_trace_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (strcmp(s, "old") == 0)
+ call_trace = -1;
+ else if (strcmp(s, "both") == 0)
+ call_trace = 0;
+ else if (strcmp(s, "newfallback") == 0)
+ call_trace = 1;
+ else if (strcmp(s, "new") == 0)
+ call_trace = 2;
+ return 0;
+}
+early_param("call_trace", call_trace_setup);
+#endif
struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
{
+#ifndef CONFIG_X86_NO_TSS
struct tss_struct *tss;
+#endif
struct pt_regs *ret;
unsigned long tmp;
do_exit(SIGSEGV);
}
+#ifndef CONFIG_X86_NO_TSS
tss = &per_cpu(init_tss, get_cpu());
+#endif
current->thread.sp0 = current->thread.saved_sp0;
current->thread.sysenter_cs = __KERNEL_CS;
load_sp0(tss, ¤t->thread);
current->thread.saved_sp0 = 0;
+#ifndef CONFIG_X86_NO_TSS
put_cpu();
+#endif
ret = KVM86->regs32;
static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
{
+#ifndef CONFIG_X86_NO_TSS
struct tss_struct *tss;
+#endif
/*
* make sure the vm86() system call doesn't try to do anything silly
*/
tsk->thread.saved_fs = info->regs32->fs;
savesegment(gs, tsk->thread.saved_gs);
+#ifndef CONFIG_X86_NO_TSS
tss = &per_cpu(init_tss, get_cpu());
+#endif
tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
if (cpu_has_sep)
tsk->thread.sysenter_cs = 0;
load_sp0(tss, &tsk->thread);
+#ifndef CONFIG_X86_NO_TSS
put_cpu();
+#endif
tsk->thread.screen_bitmap = info->screen_bitmap;
if (info->flags & VM86_SCREEN_BITMAP)
SECTIONS
{
. = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+
+#if defined(CONFIG_XEN) && CONFIG_XEN_COMPAT <= 0x030002
+#undef LOAD_OFFSET
+#define LOAD_OFFSET 0
+#endif
+
phys_startup_32 = startup_32 - LOAD_OFFSET;
.text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
--- /dev/null
+/*
+ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ * Thanks to hpa@transmeta.com for some useful hint.
+ * Special thanks to Ingo Molnar for his early experience with
+ * a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
+ * at virtual address -10Mbyte+1024bytes etc... There are at max 4
+ * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
+ * jumping out of line if necessary. We cannot add more with this
+ * mechanism because older kernels won't return -ENOSYS.
+ * If we want more than four we need a vDSO.
+ *
+ * Note: the concept clashes with user mode linux. If you use UML and
+ * want per guest time just set the kernel.vsyscall64 sysctl to 0.
+ */
+
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/seqlock.h>
+#include <linux/jiffies.h>
+#include <linux/sysctl.h>
+#include <linux/clocksource.h>
+#include <linux/getcpu.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/notifier.h>
+
+#include <asm/vsyscall.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/unistd.h>
+#include <asm/fixmap.h>
+#include <asm/errno.h>
+#include <asm/io.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/topology.h>
+#include <asm/vgtod.h>
+
+#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __syscall_clobber "r11","cx","memory"
+
+/*
+ * vsyscall_gtod_data contains data that is :
+ * - readonly from vsyscalls
+ * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
+ * Try to keep this structure as small as possible to avoid cache line ping pongs
+ */
+int __vgetcpu_mode __section_vgetcpu_mode;
+
+struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
+{
+ .lock = SEQLOCK_UNLOCKED,
+ .sysctl_enabled = 1,
+};
+
+void update_vsyscall_tz(void)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+ /* sys_tz has changed */
+ vsyscall_gtod_data.sys_tz = sys_tz;
+ write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+}
+
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+ /* copy vsyscall data */
+ vsyscall_gtod_data.clock.vread = clock->vread;
+ vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
+ vsyscall_gtod_data.clock.mask = clock->mask;
+ vsyscall_gtod_data.clock.mult = clock->mult;
+ vsyscall_gtod_data.clock.shift = clock->shift;
+ vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
+ vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
+ vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
+ write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+}
+
+/* RED-PEN may want to readd seq locking, but then the variable should be
+ * write-once.
+ */
+static __always_inline void do_get_tz(struct timezone * tz)
+{
+ *tz = __vsyscall_gtod_data.sys_tz;
+}
+
+static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+ int ret;
+ asm volatile("syscall"
+ : "=a" (ret)
+ : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
+ : __syscall_clobber );
+ return ret;
+}
+
+static __always_inline long time_syscall(long *t)
+{
+ long secs;
+ asm volatile("syscall"
+ : "=a" (secs)
+ : "0" (__NR_time),"D" (t) : __syscall_clobber);
+ return secs;
+}
+
+static __always_inline void do_vgettimeofday(struct timeval * tv)
+{
+ cycle_t now, base, mask, cycle_delta;
+ unsigned seq;
+ unsigned long mult, shift, nsec;
+ cycle_t (*vread)(void);
+ do {
+ seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+
+ vread = __vsyscall_gtod_data.clock.vread;
+ if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
+ gettimeofday(tv,NULL);
+ return;
+ }
+ now = vread();
+ base = __vsyscall_gtod_data.clock.cycle_last;
+ mask = __vsyscall_gtod_data.clock.mask;
+ mult = __vsyscall_gtod_data.clock.mult;
+ shift = __vsyscall_gtod_data.clock.shift;
+
+ tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
+ nsec = __vsyscall_gtod_data.wall_time_nsec;
+ } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+
+ /* calculate interval: */
+ cycle_delta = (now - base) & mask;
+ /* convert to nsecs: */
+ nsec += (cycle_delta * mult) >> shift;
+
+ while (nsec >= NSEC_PER_SEC) {
+ tv->tv_sec += 1;
+ nsec -= NSEC_PER_SEC;
+ }
+ tv->tv_usec = nsec / NSEC_PER_USEC;
+}
+
+int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
+{
+ if (tv)
+ do_vgettimeofday(tv);
+ if (tz)
+ do_get_tz(tz);
+ return 0;
+}
+
+/* This will break when the xtime seconds get inaccurate, but that is
+ * unlikely */
+time_t __vsyscall(1) vtime(time_t *t)
+{
+ struct timeval tv;
+ time_t result;
+ if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
+ return time_syscall(t);
+
+ vgettimeofday(&tv, NULL);
+ result = tv.tv_sec;
+ if (t)
+ *t = result;
+ return result;
+}
+
+/* Fast way to get current CPU and node.
+ This helps to do per node and per CPU caches in user space.
+ The result is not guaranteed without CPU affinity, but usually
+ works out because the scheduler tries to keep a thread on the same
+ CPU.
+
+ tcache must point to a two element sized long array.
+ All arguments can be NULL. */
+long __vsyscall(2)
+vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+{
+ unsigned int p;
+ unsigned long j = 0;
+
+ /* Fast cache - only recompute value once per jiffies and avoid
+ relatively costly rdtscp/cpuid otherwise.
+ This works because the scheduler usually keeps the process
+ on the same CPU and this syscall doesn't guarantee its
+ results anyways.
+ We do this here because otherwise user space would do it on
+ its own in a likely inferior way (no access to jiffies).
+ If you don't like it pass NULL. */
+ if (tcache && tcache->blob[0] == (j = __jiffies)) {
+ p = tcache->blob[1];
+ } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
+ /* Load per CPU data from RDTSCP */
+ native_read_tscp(&p);
+ } else {
+ /* Load per CPU data from GDT */
+ asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+ }
+ if (tcache) {
+ tcache->blob[0] = j;
+ tcache->blob[1] = p;
+ }
+ if (cpu)
+ *cpu = p & 0xfff;
+ if (node)
+ *node = p >> 12;
+ return 0;
+}
+
+static long __vsyscall(3) venosys_1(void)
+{
+ return -ENOSYS;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static int
+vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+}
+
+static ctl_table kernel_table2[] = {
+ { .procname = "vsyscall64",
+ .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = vsyscall_sysctl_change },
+ {}
+};
+
+static ctl_table kernel_root_table2[] = {
+ { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
+ .child = kernel_table2 },
+ {}
+};
+#endif
+
+/* Assume __initcall executes before all user space. Hopefully kmod
+ doesn't violate that. We'll find out if it does. */
+static void __cpuinit vsyscall_set_cpu(int cpu)
+{
+ unsigned long d;
+ unsigned long node = 0;
+#ifdef CONFIG_NUMA
+ node = cpu_to_node(cpu);
+#endif
+ if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
+ write_rdtscp_aux((node << 12) | cpu);
+
+ /* Store cpu number in limit so that it can be loaded quickly
+ in user space in vgetcpu.
+ 12 bits for the CPU and 8 bits for the node. */
+ d = 0x0f40000000000ULL;
+ d |= cpu;
+ d |= (node & 0xf) << 12;
+ d |= (node >> 4) << 48;
+ if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
+ + GDT_ENTRY_PER_CPU),
+ d))
+ BUG();
+}
+
+static void __cpuinit cpu_vsyscall_init(void *arg)
+{
+ /* preemption should be already off */
+ vsyscall_set_cpu(raw_smp_processor_id());
+}
+
+static int __cpuinit
+cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
+{
+ long cpu = (long)arg;
+ if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
+ smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
+ return NOTIFY_DONE;
+}
+
+void __init map_vsyscall(void)
+{
+ extern char __vsyscall_0;
+ unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
+
+ /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
+ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+}
+
+static int __init vsyscall_init(void)
+{
+ BUG_ON(((unsigned long) &vgettimeofday !=
+ VSYSCALL_ADDR(__NR_vgettimeofday)));
+ BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
+ BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
+ BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
+#ifdef CONFIG_XEN
+ vsyscall_gtod_data.sysctl_enabled = 0; /* disable vgettimeofay() */
+ if (boot_cpu_has(X86_FEATURE_RDTSCP))
+ vgetcpu_mode = VGETCPU_RDTSCP;
+ else
+ vgetcpu_mode = VGETCPU_LSL;
+#endif
+#ifdef CONFIG_SYSCTL
+ register_sysctl_table(kernel_root_table2);
+#endif
+ on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
+ hotcpu_notifier(cpu_vsyscall_notifier, 0);
+ return 0;
+}
+
+__initcall(vsyscall_init);
menuconfig VIRTUALIZATION
bool "Virtualization"
depends on HAVE_KVM || X86
+ depends on !XEN
default y
---help---
Say Y here to get to see options for using your Linux host to run other
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
endif
+
+lib-$(CONFIG_XEN_SCRUB_PAGES) += scrub.o
--- /dev/null
+#include <asm/cpufeature.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+
+void scrub_pages(void *v, unsigned int count)
+{
+ if (likely(cpu_has_xmm2)) {
+ unsigned long n = count * (PAGE_SIZE / sizeof(long) / 4);
+
+ for (; n--; v += sizeof(long) * 4)
+ asm("movnti %1,(%0)\n\t"
+ "movnti %1,%c2(%0)\n\t"
+ "movnti %1,2*%c2(%0)\n\t"
+ "movnti %1,3*%c2(%0)\n\t"
+ : : "r" (v), "r" (0L), "i" (sizeof(long))
+ : "memory");
+ asm volatile("sfence" : : : "memory");
+ } else
+ for (; count--; v += PAGE_SIZE)
+ clear_page(v);
+}
--- /dev/null
+#
+# Makefile for the linux kernel.
+#
+
+obj-y := setup.o
--- /dev/null
+/*
+ * Machine specific setup for generic
+ */
+
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <asm/acpi.h>
+#include <asm/arch_hooks.h>
+#include <asm/e820.h>
+#include <asm/setup.h>
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+
+#include <xen/interface/callback.h>
+#include <xen/interface/memory.h>
+
+#ifdef CONFIG_HOTPLUG_CPU
+#define DEFAULT_SEND_IPI (1)
+#else
+#define DEFAULT_SEND_IPI (0)
+#endif
+
+int no_broadcast=DEFAULT_SEND_IPI;
+
+static __init int no_ipi_broadcast(char *str)
+{
+ get_option(&str, &no_broadcast);
+ printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
+ "IPI Broadcast");
+ return 1;
+}
+
+__setup("no_ipi_broadcast", no_ipi_broadcast);
+
+static int __init print_ipi_mode(void)
+{
+ printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
+ "Shortcut");
+ return 0;
+}
+
+late_initcall(print_ipi_mode);
+
+/**
+ * machine_specific_memory_setup - Hook for machine specific memory setup.
+ *
+ * Description:
+ * This is included late in kernel/setup.c so that it can make
+ * use of all of the static functions.
+ **/
+
+char * __init machine_specific_memory_setup(void)
+{
+ int rc;
+ struct xen_memory_map memmap;
+ /*
+ * This is rather large for a stack variable but this early in
+ * the boot process we know we have plenty slack space.
+ */
+ struct e820entry map[E820MAX];
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, map);
+
+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+ if ( rc == -ENOSYS ) {
+ memmap.nr_entries = 1;
+ map[0].addr = 0ULL;
+ map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
+ /* 8MB slack (to balance backend allocations). */
+ map[0].size += 8ULL << 20;
+ map[0].type = E820_RAM;
+ rc = 0;
+ }
+ BUG_ON(rc);
+
+ sanitize_e820_map(map, (char *)&memmap.nr_entries);
+
+ BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
+
+ return "Xen";
+}
+
+
+extern void hypervisor_callback(void);
+extern void failsafe_callback(void);
+extern void nmi(void);
+
+unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
+EXPORT_SYMBOL(machine_to_phys_mapping);
+unsigned int machine_to_phys_order;
+EXPORT_SYMBOL(machine_to_phys_order);
+
+void __init pre_setup_arch_hook(void)
+{
+ struct xen_machphys_mapping mapping;
+ unsigned long machine_to_phys_nr_ents;
+ struct xen_platform_parameters pp;
+
+ init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base;
+
+ xen_setup_features();
+
+ if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) {
+ hypervisor_virt_start = pp.virt_start;
+ reserve_top_address(0UL - pp.virt_start);
+ }
+
+ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
+ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
+ machine_to_phys_nr_ents = mapping.max_mfn + 1;
+ } else
+ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
+ machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ phys_to_machine_mapping =
+ (unsigned long *)xen_start_info->mfn_list;
+}
+
+void __init machine_specific_arch_setup(void)
+{
+ int ret;
+ static struct callback_register __initdata event = {
+ .type = CALLBACKTYPE_event,
+ .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
+ };
+ static struct callback_register __initdata failsafe = {
+ .type = CALLBACKTYPE_failsafe,
+ .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
+ };
+ static struct callback_register __initdata nmi_cb = {
+ .type = CALLBACKTYPE_nmi,
+ .address = { __KERNEL_CS, (unsigned long)nmi },
+ };
+
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
+ if (ret == 0)
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (ret == -ENOSYS)
+ ret = HYPERVISOR_set_callbacks(
+ event.address.cs, event.address.eip,
+ failsafe.address.cs, failsafe.address.eip);
+#endif
+ BUG_ON(ret);
+
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (ret == -ENOSYS) {
+ static struct xennmi_callback __initdata cb = {
+ .handler_address = (unsigned long)nmi
+ };
+
+ HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
+ }
+#endif
+
+ /* Do an early initialization of the fixmap area */
+ {
+ extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
+ unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
+ pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
+ pmd_t *pmd = pmd_offset(pud, addr);
+
+ make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
+ set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
+ }
+}
obj-$(CONFIG_K8_NUMA) += k8topology_64.o
obj-$(CONFIG_ACPI_NUMA) += srat_64.o
endif
+
+obj-$(CONFIG_XEN) += hypervisor.o
--- /dev/null
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h> /* For unblank_screen() */
+#include <linux/compiler.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h> /* for max_low_pfn */
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+
+#include <asm/system.h>
+#include <asm/desc.h>
+#include <asm/segment.h>
+#include <asm/pgalloc.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+#include <asm-generic/sections.h>
+
+/*
+ * Page fault error code bits
+ * bit 0 == 0 means no page found, 1 means protection fault
+ * bit 1 == 0 means read, 1 means write
+ * bit 2 == 0 means kernel, 1 means user-mode
+ * bit 3 == 1 means use of reserved bit detected
+ * bit 4 == 1 means fault was an instruction fetch
+ */
+#define PF_PROT (1<<0)
+#define PF_WRITE (1<<1)
+#define PF_USER (1<<2)
+#define PF_RSVD (1<<3)
+#define PF_INSTR (1<<4)
+
+static inline int notify_page_fault(struct pt_regs *regs)
+{
+#ifdef CONFIG_KPROBES
+ int ret = 0;
+
+ /* kprobe_running() needs smp_processor_id() */
+#ifdef CONFIG_X86_32
+ if (!user_mode_vm(regs)) {
+#else
+ if (!user_mode(regs)) {
+#endif
+ preempt_disable();
+ if (kprobe_running() && kprobe_fault_handler(regs, 14))
+ ret = 1;
+ preempt_enable();
+ }
+
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+/*
+ * X86_32
+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * X86_64
+ * Sometimes the CPU reports invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * Opcode checker based on code by Richard Brunner
+ */
+static int is_prefetch(struct pt_regs *regs, unsigned long addr,
+ unsigned long error_code)
+{
+ unsigned char *instr;
+ int scan_more = 1;
+ int prefetch = 0;
+ unsigned char *max_instr;
+
+ /*
+ * If it was a exec (instruction fetch) fault on NX page, then
+ * do not ignore the fault:
+ */
+ if (error_code & PF_INSTR)
+ return 0;
+
+ instr = (unsigned char *)convert_ip_to_linear(current, regs);
+ max_instr = instr + 15;
+
+ if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
+ return 0;
+
+ while (scan_more && instr < max_instr) {
+ unsigned char opcode;
+ unsigned char instr_hi;
+ unsigned char instr_lo;
+
+ if (probe_kernel_address(instr, opcode))
+ break;
+
+ instr_hi = opcode & 0xf0;
+ instr_lo = opcode & 0x0f;
+ instr++;
+
+ switch (instr_hi) {
+ case 0x20:
+ case 0x30:
+ /*
+ * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
+ * In X86_64 long mode, the CPU will signal invalid
+ * opcode if some of these prefixes are present so
+ * X86_64 will never get here anyway
+ */
+ scan_more = ((instr_lo & 7) == 0x6);
+ break;
+#ifdef CONFIG_X86_64
+ case 0x40:
+ /*
+ * In AMD64 long mode 0x40..0x4F are valid REX prefixes
+ * Need to figure out under what instruction mode the
+ * instruction was issued. Could check the LDT for lm,
+ * but for now it's good enough to assume that long
+ * mode only uses well known segments or kernel.
+ */
+ scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
+ break;
+#endif
+ case 0x60:
+ /* 0x64 thru 0x67 are valid prefixes in all modes. */
+ scan_more = (instr_lo & 0xC) == 0x4;
+ break;
+ case 0xF0:
+ /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
+ scan_more = !instr_lo || (instr_lo>>1) == 1;
+ break;
+ case 0x00:
+ /* Prefetch instruction is 0x0F0D or 0x0F18 */
+ scan_more = 0;
+
+ if (probe_kernel_address(instr, opcode))
+ break;
+ prefetch = (instr_lo == 0xF) &&
+ (opcode == 0x0D || opcode == 0x18);
+ break;
+ default:
+ scan_more = 0;
+ break;
+ }
+ }
+ return prefetch;
+}
+
+static void force_sig_info_fault(int si_signo, int si_code,
+ unsigned long address, struct task_struct *tsk)
+{
+ siginfo_t info;
+
+ info.si_signo = si_signo;
+ info.si_errno = 0;
+ info.si_code = si_code;
+ info.si_addr = (void __user *)address;
+ force_sig_info(si_signo, &info, tsk);
+}
+
+#ifdef CONFIG_X86_64
+static int bad_address(void *p)
+{
+ unsigned long dummy;
+ return probe_kernel_address((unsigned long *)p, dummy);
+}
+#endif
+
+static void dump_pagetable(unsigned long address)
+{
+#ifdef CONFIG_X86_32
+ __typeof__(pte_val(__pte(0))) page;
+
+ page = read_cr3();
+ page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
+#ifdef CONFIG_X86_PAE
+ printk("*pdpt = %016Lx ", page);
+ if ((page & _PAGE_PRESENT)
+ && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn) {
+ page = mfn_to_pfn(page >> PAGE_SHIFT);
+ page <<= PAGE_SHIFT;
+ page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
+ & (PTRS_PER_PMD - 1)];
+ printk(KERN_CONT "*pde = %016Lx ", page);
+ page &= ~_PAGE_NX;
+ }
+#else
+ printk("*pde = %08lx ", page);
+#endif
+
+ /*
+ * We must not directly access the pte in the highpte
+ * case if the page table is located in highmem.
+ * And let's rather not kmap-atomic the pte, just in case
+ * it's allocated already.
+ */
+ if ((page & _PAGE_PRESENT)
+ && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn
+ && !(page & _PAGE_PSE)) {
+ page = mfn_to_pfn(page >> PAGE_SHIFT);
+ page <<= PAGE_SHIFT;
+ page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
+ & (PTRS_PER_PTE - 1)];
+ printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page);
+ }
+
+ printk(KERN_CONT "\n");
+#else /* CONFIG_X86_64 */
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = (pgd_t *)read_cr3();
+
+ pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
+ pgd += pgd_index(address);
+ if (bad_address(pgd)) goto bad;
+ printk("PGD %lx ", pgd_val(*pgd));
+ if (!pgd_present(*pgd)) goto ret;
+
+ pud = pud_offset(pgd, address);
+ if (bad_address(pud)) goto bad;
+ printk(KERN_CONT "PUD %lx ", pud_val(*pud));
+ if (!pud_present(*pud) || pud_large(*pud))
+ goto ret;
+
+ pmd = pmd_offset(pud, address);
+ if (bad_address(pmd)) goto bad;
+ printk(KERN_CONT "PMD %lx ", pmd_val(*pmd));
+ if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
+
+ pte = pte_offset_kernel(pmd, address);
+ if (bad_address(pte)) goto bad;
+ printk(KERN_CONT "PTE %lx", pte_val(*pte));
+ret:
+ printk(KERN_CONT "\n");
+ return;
+bad:
+ printk("BAD\n");
+#endif
+}
+
+#ifdef CONFIG_X86_32
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+ unsigned index = pgd_index(address);
+ pgd_t *pgd_k;
+ pud_t *pud, *pud_k;
+ pmd_t *pmd, *pmd_k;
+
+ pgd += index;
+ pgd_k = init_mm.pgd + index;
+
+ if (!pgd_present(*pgd_k))
+ return NULL;
+
+ /*
+ * set_pgd(pgd, *pgd_k); here would be useless on PAE
+ * and redundant with the set_pmd() on non-PAE. As would
+ * set_pud.
+ */
+
+ pud = pud_offset(pgd, address);
+ pud_k = pud_offset(pgd_k, address);
+ if (!pud_present(*pud_k))
+ return NULL;
+
+ pmd = pmd_offset(pud, address);
+ pmd_k = pmd_offset(pud_k, address);
+ if (!pmd_present(*pmd_k))
+ return NULL;
+ if (!pmd_present(*pmd)) {
+#if CONFIG_XEN_COMPAT > 0x030002
+ set_pmd(pmd, *pmd_k);
+#else
+ /*
+ * When running on older Xen we must launder *pmd_k through
+ * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
+ */
+ set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
+#endif
+ arch_flush_lazy_mmu_mode();
+ } else
+ BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+ return pmd_k;
+}
+#endif
+
+#ifdef CONFIG_X86_64
+static const char errata93_warning[] =
+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
+KERN_ERR "******* Please consider a BIOS update.\n"
+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+#endif
+
+/* Workaround for K8 erratum #93 & buggy BIOS.
+ BIOS SMM functions are required to use a specific workaround
+ to avoid corruption of the 64bit RIP register on C stepping K8.
+ A lot of BIOS that didn't get tested properly miss this.
+ The OS sees this as a page fault with the upper 32bits of RIP cleared.
+ Try to work around it here.
+ Note we only handle faults in kernel here.
+ Does nothing for X86_32
+ */
+static int is_errata93(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+ static int warned;
+ if (address != regs->ip)
+ return 0;
+ if ((address >> 32) != 0)
+ return 0;
+ address |= 0xffffffffUL << 32;
+ if ((address >= (u64)_stext && address <= (u64)_etext) ||
+ (address >= MODULES_VADDR && address <= MODULES_END)) {
+ if (!warned) {
+ printk(errata93_warning);
+ warned = 1;
+ }
+ regs->ip = address;
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/*
+ * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal
+ * addresses >4GB. We catch this in the page fault handler because these
+ * addresses are not reachable. Just detect this case and return. Any code
+ * segment in LDT is compatibility mode.
+ */
+static int is_errata100(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+ if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
+ (address >> 32))
+ return 1;
+#endif
+ return 0;
+}
+
+void do_invalid_op(struct pt_regs *, unsigned long);
+
+static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_F00F_BUG
+ unsigned long nr;
+ /*
+ * Pentium F0 0F C7 C8 bug workaround.
+ */
+ if (boot_cpu_data.f00f_bug) {
+ nr = (address - idt_descr.address) >> 3;
+
+ if (nr == 6) {
+ do_invalid_op(regs, 0);
+ return 1;
+ }
+ }
+#endif
+ return 0;
+}
+
+static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+#ifdef CONFIG_X86_32
+ if (!oops_may_print())
+ return;
+#endif
+
+#ifdef CONFIG_X86_PAE
+ if (error_code & PF_INSTR) {
+ unsigned int level;
+ pte_t *pte = lookup_address(address, &level);
+
+ if (pte && pte_present(*pte) && !pte_exec(*pte))
+ printk(KERN_CRIT "kernel tried to execute "
+ "NX-protected page - exploit attempt? "
+ "(uid: %d)\n", current->uid);
+ }
+#endif
+
+ printk(KERN_ALERT "BUG: unable to handle kernel ");
+ if (address < PAGE_SIZE)
+ printk(KERN_CONT "NULL pointer dereference");
+ else
+ printk(KERN_CONT "paging request");
+#ifdef CONFIG_X86_32
+ printk(KERN_CONT " at %08lx\n", address);
+#else
+ printk(KERN_CONT " at %016lx\n", address);
+#endif
+ printk(KERN_ALERT "IP:");
+ printk_address(regs->ip, 1);
+ dump_pagetable(address);
+}
+
+#ifdef CONFIG_X86_64
+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
+ unsigned long error_code)
+{
+ unsigned long flags = oops_begin();
+ struct task_struct *tsk;
+
+ printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+ current->comm, address);
+ dump_pagetable(address);
+ tsk = current;
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code;
+ if (__die("Bad pagetable", regs, error_code))
+ regs = NULL;
+ oops_end(flags, regs, SIGKILL);
+}
+#endif
+
+static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+{
+ if ((error_code & PF_WRITE) && !pte_write(*pte))
+ return 0;
+ if ((error_code & PF_INSTR) && !pte_exec(*pte))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Handle a spurious fault caused by a stale TLB entry. This allows
+ * us to lazily refresh the TLB when increasing the permissions of a
+ * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
+ * expensive since that implies doing a full cross-processor TLB
+ * flush, even if no stale TLB entries exist on other processors.
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.
+ */
+static int spurious_fault(unsigned long address,
+ unsigned long error_code)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ /* Reserved-bit violation or user access to kernel space? */
+ if (error_code & (PF_USER | PF_RSVD))
+ return 0;
+
+ pgd = init_mm.pgd + pgd_index(address);
+ if (!pgd_present(*pgd))
+ return 0;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return 0;
+
+ if (pud_large(*pud))
+ return spurious_fault_check(error_code, (pte_t *) pud);
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return 0;
+
+ if (pmd_large(*pmd))
+ return spurious_fault_check(error_code, (pte_t *) pmd);
+
+ pte = pte_offset_kernel(pmd, address);
+ if (!pte_present(*pte))
+ return 0;
+
+ return spurious_fault_check(error_code, pte);
+}
+
+/*
+ * X86_32
+ * Handle a fault on the vmalloc or module mapping area
+ *
+ * X86_64
+ * Handle a fault on the vmalloc area
+ *
+ * This assumes no large pages in there.
+ */
+static int vmalloc_fault(unsigned long address)
+{
+#ifdef CONFIG_X86_32
+ unsigned long pgd_paddr;
+ pmd_t *pmd_k;
+ pte_t *pte_k;
+
+ /* Make sure we are in vmalloc area */
+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ return -1;
+
+ /*
+ * Synchronize this task's top level page-table
+ * with the 'reference' page table.
+ *
+ * Do _not_ use "current" here. We might be inside
+ * an interrupt in the middle of a task switch..
+ */
+ pgd_paddr = read_cr3();
+ pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+ if (!pmd_k)
+ return -1;
+ pte_k = pte_offset_kernel(pmd_k, address);
+ if (!pte_present(*pte_k))
+ return -1;
+ return 0;
+#else
+ pgd_t *pgd, *pgd_ref;
+ pud_t *pud, *pud_ref;
+ pmd_t *pmd, *pmd_ref;
+ pte_t *pte, *pte_ref;
+
+ /* Make sure we are in vmalloc area */
+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ return -1;
+
+ /* Copy kernel mappings over when needed. This can also
+ happen within a race in page table update. In the later
+ case just flush. */
+
+ /* On Xen the line below does not always work. Needs investigating! */
+ /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
+ pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+ pgd += pgd_index(address);
+ pgd_ref = pgd_offset_k(address);
+ if (pgd_none(*pgd_ref))
+ return -1;
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ else
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+ /* Below here mismatches are bugs because these lower tables
+ are shared */
+
+ pud = pud_offset(pgd, address);
+ pud_ref = pud_offset(pgd_ref, address);
+ if (pud_none(*pud_ref))
+ return -1;
+ if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
+ BUG();
+ pmd = pmd_offset(pud, address);
+ pmd_ref = pmd_offset(pud_ref, address);
+ if (pmd_none(*pmd_ref))
+ return -1;
+ if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+ BUG();
+ pte_ref = pte_offset_kernel(pmd_ref, address);
+ if (!pte_present(*pte_ref))
+ return -1;
+ pte = pte_offset_kernel(pmd, address);
+ /* Don't use pte_page here, because the mappings can point
+ outside mem_map, and the NUMA hash lookup cannot handle
+ that. */
+ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
+ BUG();
+ return 0;
+#endif
+}
+
+int show_unhandled_signals = 1;
+
+/*
+ * This routine handles page faults. It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+#ifdef CONFIG_X86_64
+asmlinkage
+#endif
+void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ unsigned long address;
+ int write, si_code;
+ int fault;
+#ifdef CONFIG_X86_64
+ unsigned long flags;
+#endif
+
+ /*
+ * We can fault from pretty much anywhere, with unknown IRQ state.
+ */
+ trace_hardirqs_fixup();
+
+ /* Set the "privileged fault" bit to something sane. */
+ if (user_mode_vm(regs))
+ error_code |= PF_USER;
+ else
+ error_code &= ~PF_USER;
+
+ tsk = current;
+ mm = tsk->mm;
+ prefetchw(&mm->mmap_sem);
+
+ /* get the address */
+ address = read_cr2();
+
+ si_code = SEGV_MAPERR;
+
+ if (notify_page_fault(regs))
+ return;
+
+ /*
+ * We fault-in kernel-space virtual memory on-demand. The
+ * 'reference' page table is init_mm.pgd.
+ *
+ * NOTE! We MUST NOT take any locks for this case. We may
+ * be in an interrupt or a critical region, and should
+ * only copy the information from the master page table,
+ * nothing more.
+ *
+ * This verifies that the fault happens in kernel space
+ * (error_code & 4) == 0, and that the fault was not a
+ * protection error (error_code & 9) == 0.
+ */
+#ifdef CONFIG_X86_32
+ if (unlikely(address >= TASK_SIZE)) {
+#else
+ if (unlikely(address >= TASK_SIZE64)) {
+#endif
+ /* Faults in hypervisor area can never be patched up. */
+#if defined(CONFIG_X86_XEN)
+ if (address >= hypervisor_virt_start)
+ goto bad_area_nosemaphore;
+#elif defined(CONFIG_X86_64_XEN)
+ /* Faults in hypervisor area are never spurious. */
+ if (address >= HYPERVISOR_VIRT_START
+ && address < HYPERVISOR_VIRT_END)
+ goto bad_area_nosemaphore;
+#endif
+ if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
+ vmalloc_fault(address) >= 0)
+ return;
+
+ /* Can handle a stale RO->RW TLB */
+ if (spurious_fault(address, error_code))
+ return;
+
+ /*
+ * Don't take the mm semaphore here. If we fixup a prefetch
+ * fault we could otherwise deadlock.
+ */
+ goto bad_area_nosemaphore;
+ }
+
+
+#ifdef CONFIG_X86_32
+ /* It's safe to allow irq's after cr2 has been saved and the vmalloc
+ fault has been handled. */
+ if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
+ local_irq_enable();
+
+ /*
+ * If we're in an interrupt, have no user context or are running in an
+ * atomic region then we must not take the fault.
+ */
+ if (in_atomic() || !mm)
+ goto bad_area_nosemaphore;
+#else /* CONFIG_X86_64 */
+ if (likely(regs->flags & X86_EFLAGS_IF))
+ local_irq_enable();
+
+ if (unlikely(error_code & PF_RSVD))
+ pgtable_bad(address, regs, error_code);
+
+ /*
+ * If we're in an interrupt, have no user context or are running in an
+ * atomic region then we must not take the fault.
+ */
+ if (unlikely(in_atomic() || !mm))
+ goto bad_area_nosemaphore;
+
+ /*
+ * User-mode registers count as a user access even for any
+ * potential system fault or CPU buglet.
+ */
+ if (user_mode_vm(regs))
+ error_code |= PF_USER;
+again:
+#endif
+ /* When running in the kernel we expect faults to occur only to
+ * addresses in user space. All other faults represent errors in the
+ * kernel and should generate an OOPS. Unfortunately, in the case of an
+ * erroneous fault occurring in a code path which already holds mmap_sem
+ * we will deadlock attempting to validate the fault against the
+ * address space. Luckily the kernel only validly references user
+ * space from well defined areas of code, which are listed in the
+ * exceptions table.
+ *
+ * As the vast majority of faults will be valid we will only perform
+ * the source reference check when there is a possibility of a deadlock.
+ * Attempt to lock the address space, if we cannot we then validate the
+ * source. If this is invalid we can skip the address space check,
+ * thus avoiding the deadlock.
+ */
+ if (!down_read_trylock(&mm->mmap_sem)) {
+ if ((error_code & PF_USER) == 0 &&
+ !search_exception_tables(regs->ip))
+ goto bad_area_nosemaphore;
+ down_read(&mm->mmap_sem);
+ }
+
+ vma = find_vma(mm, address);
+ if (!vma)
+ goto bad_area;
+ if (vma->vm_start <= address)
+ goto good_area;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto bad_area;
+ if (error_code & PF_USER) {
+ /*
+ * Accessing the stack below %sp is always a bug.
+ * The large cushion allows instructions like enter
+ * and pusha to work. ("enter $65535,$31" pushes
+ * 32 pointers and then decrements %sp by 65535.)
+ */
+ if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
+ goto bad_area;
+ }
+ if (expand_stack(vma, address))
+ goto bad_area;
+/*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+ si_code = SEGV_ACCERR;
+ write = 0;
+ switch (error_code & (PF_PROT|PF_WRITE)) {
+ default: /* 3: write, present */
+ /* fall through */
+ case PF_WRITE: /* write, not present */
+ if (!(vma->vm_flags & VM_WRITE))
+ goto bad_area;
+ write++;
+ break;
+ case PF_PROT: /* read, present */
+ goto bad_area;
+ case 0: /* read, not present */
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+ goto bad_area;
+ }
+
+#ifdef CONFIG_X86_32
+survive:
+#endif
+ /*
+ * If for any reason at all we couldn't handle the fault,
+ * make sure we exit gracefully rather than endlessly redo
+ * the fault.
+ */
+ fault = handle_mm_fault(mm, vma, address, write);
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+ if (fault & VM_FAULT_OOM)
+ goto out_of_memory;
+ else if (fault & VM_FAULT_SIGBUS)
+ goto do_sigbus;
+ BUG();
+ }
+ if (fault & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+
+#ifdef CONFIG_X86_32
+ /*
+ * Did it hit the DOS screen memory VA from vm86 mode?
+ */
+ if (v8086_mode(regs)) {
+ unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
+ if (bit < 32)
+ tsk->thread.screen_bitmap |= 1 << bit;
+ }
+#endif
+ up_read(&mm->mmap_sem);
+ return;
+
+/*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+bad_area:
+ up_read(&mm->mmap_sem);
+
+bad_area_nosemaphore:
+ /* User mode accesses just cause a SIGSEGV */
+ if (error_code & PF_USER) {
+ /*
+ * It's possible to have interrupts off here.
+ */
+ local_irq_enable();
+
+ /*
+ * Valid to do another page fault here because this one came
+ * from user space.
+ */
+ if (is_prefetch(regs, address, error_code))
+ return;
+
+ if (is_errata100(regs, address))
+ return;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+ printk(
+#ifdef CONFIG_X86_32
+ "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
+#else
+ "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
+#endif
+ task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+ tsk->comm, task_pid_nr(tsk), address, regs->ip,
+ regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ printk("\n");
+ }
+
+ tsk->thread.cr2 = address;
+ /* Kernel addresses are always protection faults */
+ tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+ tsk->thread.trap_no = 14;
+ force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+ return;
+ }
+
+ if (is_f00f_bug(regs, address))
+ return;
+
+no_context:
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs))
+ return;
+
+ /*
+ * X86_32
+ * Valid to do another page fault here, because if this fault
+ * had been triggered by is_prefetch fixup_exception would have
+ * handled it.
+ *
+ * X86_64
+ * Hall of shame of CPU/BIOS bugs.
+ */
+ if (is_prefetch(regs, address, error_code))
+ return;
+
+ if (is_errata93(regs, address))
+ return;
+
+/*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+#ifdef CONFIG_X86_32
+ bust_spinlocks(1);
+#else
+ flags = oops_begin();
+#endif
+
+ show_fault_oops(regs, error_code, address);
+
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code;
+
+#ifdef CONFIG_X86_32
+ die("Oops", regs, error_code);
+ bust_spinlocks(0);
+ do_exit(SIGKILL);
+#else
+ if (__die("Oops", regs, error_code))
+ regs = NULL;
+ /* Executive summary in case the body of the oops scrolled away */
+ printk(KERN_EMERG "CR2: %016lx\n", address);
+ oops_end(flags, regs, SIGKILL);
+#endif
+
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+ up_read(&mm->mmap_sem);
+ if (is_global_init(tsk)) {
+ yield();
+#ifdef CONFIG_X86_32
+ down_read(&mm->mmap_sem);
+ goto survive;
+#else
+ goto again;
+#endif
+ }
+
+ printk("VM: killing process %s\n", tsk->comm);
+ if (error_code & PF_USER)
+ do_group_exit(SIGKILL);
+ goto no_context;
+
+do_sigbus:
+ up_read(&mm->mmap_sem);
+
+ /* Kernel mode? Handle exceptions or die */
+ if (!(error_code & PF_USER))
+ goto no_context;
+#ifdef CONFIG_X86_32
+ /* User space => ok to do another page fault */
+ if (is_prefetch(regs, address, error_code))
+ return;
+#endif
+ tsk->thread.cr2 = address;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 14;
+ force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+
+DEFINE_SPINLOCK(pgd_lock);
+LIST_HEAD(pgd_list);
+
+void vmalloc_sync_all(void)
+{
+#ifdef CONFIG_X86_32
+ /*
+ * Note that races in the updates of insync and start aren't
+ * problematic: insync can only get set bits added, and updates to
+ * start are only improving performance (without affecting correctness
+ * if undone).
+ * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
+ * This change works just fine with 2-level paging too.
+ */
+#define sync_index(a) ((a) >> PMD_SHIFT)
+ static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
+ static unsigned long start = TASK_SIZE;
+ unsigned long address;
+
+ if (SHARED_KERNEL_PMD)
+ return;
+
+ BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
+ for (address = start;
+ address < hypervisor_virt_start;
+ address += PMD_SIZE) {
+ if (!test_bit(sync_index(address), insync)) {
+ unsigned long flags;
+ struct page *page;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ /* XEN: failure path assumes non-empty pgd_list. */
+ if (unlikely(list_empty(&pgd_list))) {
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ return;
+ }
+ list_for_each_entry(page, &pgd_list, lru) {
+ if (!vmalloc_sync_one(page_address(page),
+ address))
+ break;
+ }
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ if (!page)
+ set_bit(sync_index(address), insync);
+ }
+ if (address == start && test_bit(sync_index(address), insync))
+ start = address + PMD_SIZE;
+ }
+#else /* CONFIG_X86_64 */
+ /*
+ * Note that races in the updates of insync and start aren't
+ * problematic: insync can only get set bits added, and updates to
+ * start are only improving performance (without affecting correctness
+ * if undone).
+ */
+ static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+ static unsigned long start = VMALLOC_START & PGDIR_MASK;
+ unsigned long address;
+
+ for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
+ if (!test_bit(pgd_index(address), insync)) {
+ const pgd_t *pgd_ref = pgd_offset_k(address);
+ unsigned long flags;
+ struct page *page;
+
+ if (pgd_none(*pgd_ref))
+ continue;
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ else
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+ }
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ set_bit(pgd_index(address), insync);
+ }
+ if (address == start)
+ start = address + PGDIR_SIZE;
+ }
+#endif
+}
--- /dev/null
+#include <linux/highmem.h>
+#include <linux/module.h>
+
+void *kmap(struct page *page)
+{
+ might_sleep();
+ if (!PageHighMem(page))
+ return page_address(page);
+ return kmap_high(page);
+}
+
+void kunmap(struct page *page)
+{
+ if (in_interrupt())
+ BUG();
+ if (!PageHighMem(page))
+ return;
+ kunmap_high(page);
+}
+
+static void debug_kmap_atomic_prot(enum km_type type)
+{
+#ifdef CONFIG_DEBUG_HIGHMEM
+ static unsigned warn_count = 10;
+
+ if (unlikely(warn_count == 0))
+ return;
+
+ if (unlikely(in_interrupt())) {
+ if (in_irq()) {
+ if (type != KM_IRQ0 && type != KM_IRQ1 &&
+ type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
+ type != KM_BOUNCE_READ) {
+ WARN_ON(1);
+ warn_count--;
+ }
+ } else if (!irqs_disabled()) { /* softirq */
+ if (type != KM_IRQ0 && type != KM_IRQ1 &&
+ type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
+ type != KM_SKB_SUNRPC_DATA &&
+ type != KM_SKB_DATA_SOFTIRQ &&
+ type != KM_BOUNCE_READ) {
+ WARN_ON(1);
+ warn_count--;
+ }
+ }
+ }
+
+ if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
+ type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
+ if (!irqs_disabled()) {
+ WARN_ON(1);
+ warn_count--;
+ }
+ } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
+ if (irq_count() == 0 && !irqs_disabled()) {
+ WARN_ON(1);
+ warn_count--;
+ }
+ }
+#endif
+}
+
+/*
+ * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
+ * no global lock is needed and because the kmap code must perform a global TLB
+ * invalidation when the kmap pool wraps.
+ *
+ * However when holding an atomic kmap is is not legal to sleep, so atomic
+ * kmaps are appropriate for short, tight code paths only.
+ */
+void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
+{
+ enum fixed_addresses idx;
+ unsigned long vaddr;
+
+ /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+ pagefault_disable();
+
+ if (!PageHighMem(page))
+ return page_address(page);
+
+ debug_kmap_atomic_prot(type);
+
+ idx = type + KM_TYPE_NR*smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ BUG_ON(!pte_none(*(kmap_pte-idx)));
+ set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
+ arch_flush_lazy_mmu_mode();
+
+ return (void *)vaddr;
+}
+
+void *kmap_atomic(struct page *page, enum km_type type)
+{
+ return kmap_atomic_prot(page, type, kmap_prot);
+}
+
+void kunmap_atomic(void *kvaddr, enum km_type type)
+{
+ unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+ enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
+
+ /*
+ * Force other mappings to Oops if they'll try to access this pte
+ * without first remap it. Keeping stale mappings around is a bad idea
+ * also, in case the page changes cacheability attributes or becomes
+ * a protected page in a hypervisor.
+ */
+ if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
+ kpte_clear_flush(kmap_pte-idx, vaddr);
+ else {
+#ifdef CONFIG_DEBUG_HIGHMEM
+ BUG_ON(vaddr < PAGE_OFFSET);
+ BUG_ON(vaddr >= (unsigned long)high_memory);
+#endif
+ }
+
+ arch_flush_lazy_mmu_mode();
+ pagefault_enable();
+}
+
+/* This is the same as kmap_atomic() but can map memory that doesn't
+ * have a struct page associated with it.
+ */
+void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
+{
+ enum fixed_addresses idx;
+ unsigned long vaddr;
+
+ pagefault_disable();
+
+ idx = type + KM_TYPE_NR*smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
+ arch_flush_lazy_mmu_mode();
+
+ return (void*) vaddr;
+}
+
+struct page *kmap_atomic_to_page(void *ptr)
+{
+ unsigned long idx, vaddr = (unsigned long)ptr;
+ pte_t *pte;
+
+ if (vaddr < FIXADDR_START)
+ return virt_to_page(ptr);
+
+ idx = virt_to_fix(vaddr);
+ pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
+ return pte_page(*pte);
+}
+
+EXPORT_SYMBOL(kmap);
+EXPORT_SYMBOL(kunmap);
+EXPORT_SYMBOL(kmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic);
--- /dev/null
+/******************************************************************************
+ * mm/hypervisor.c
+ *
+ * Update page tables via the hypervisor.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/hypervisor.h>
+#include <xen/balloon.h>
+#include <xen/features.h>
+#include <xen/interface/memory.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/highmem.h>
+#include <asm/tlbflush.h>
+#include <linux/highmem.h>
+
+void xen_l1_entry_update(pte_t *ptr, pte_t val)
+{
+ mmu_update_t u;
+#ifdef CONFIG_HIGHPTE
+ u.ptr = ((unsigned long)ptr >= (unsigned long)high_memory) ?
+ arbitrary_virt_to_machine(ptr) : virt_to_machine(ptr);
+#else
+ u.ptr = virt_to_machine(ptr);
+#endif
+ u.val = __pte_val(val);
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+EXPORT_SYMBOL_GPL(xen_l1_entry_update);
+
+void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
+{
+ mmu_update_t u;
+ u.ptr = virt_to_machine(ptr);
+ u.val = __pmd_val(val);
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+
+#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+void xen_l3_entry_update(pud_t *ptr, pud_t val)
+{
+ mmu_update_t u;
+ u.ptr = virt_to_machine(ptr);
+ u.val = __pud_val(val);
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+#endif
+
+#ifdef CONFIG_X86_64
+void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
+{
+ mmu_update_t u;
+ u.ptr = virt_to_machine(ptr);
+ u.val = __pgd_val(val);
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+#endif /* CONFIG_X86_64 */
+
+void xen_pt_switch(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_NEW_BASEPTR;
+ op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_new_user_pt(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_NEW_USER_BASEPTR;
+ op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_tlb_flush(void)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+EXPORT_SYMBOL(xen_tlb_flush);
+
+void xen_invlpg(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_INVLPG_LOCAL;
+ op.arg1.linear_addr = ptr & PAGE_MASK;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+EXPORT_SYMBOL(xen_invlpg);
+
+#ifdef CONFIG_SMP
+
+void xen_tlb_flush_all(void)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_TLB_FLUSH_ALL;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_tlb_flush_mask(cpumask_t *mask)
+{
+ struct mmuext_op op;
+ if ( cpus_empty(*mask) )
+ return;
+ op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+ set_xen_guest_handle(op.arg2.vcpumask, mask->bits);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_invlpg_all(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_INVLPG_ALL;
+ op.arg1.linear_addr = ptr & PAGE_MASK;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
+{
+ struct mmuext_op op;
+ if ( cpus_empty(*mask) )
+ return;
+ op.cmd = MMUEXT_INVLPG_MULTI;
+ op.arg1.linear_addr = ptr & PAGE_MASK;
+ set_xen_guest_handle(op.arg2.vcpumask, mask->bits);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+#endif /* CONFIG_SMP */
+
+void xen_pgd_pin(unsigned long ptr)
+{
+ struct mmuext_op op;
+#ifdef CONFIG_X86_64
+ op.cmd = MMUEXT_PIN_L4_TABLE;
+#elif defined(CONFIG_X86_PAE)
+ op.cmd = MMUEXT_PIN_L3_TABLE;
+#else
+ op.cmd = MMUEXT_PIN_L2_TABLE;
+#endif
+ op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pgd_unpin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_UNPIN_TABLE;
+ op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_set_ldt(const void *ptr, unsigned int ents)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_SET_LDT;
+ op.arg1.linear_addr = (unsigned long)ptr;
+ op.arg2.nr_ents = ents;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+/*
+ * Bitmap is indexed by page number. If bit is set, the page is part of a
+ * xen_create_contiguous_region() area of memory.
+ */
+unsigned long *contiguous_bitmap;
+
+static void contiguous_bitmap_set(
+ unsigned long first_page, unsigned long nr_pages)
+{
+ unsigned long start_off, end_off, curr_idx, end_idx;
+
+ curr_idx = first_page / BITS_PER_LONG;
+ start_off = first_page & (BITS_PER_LONG-1);
+ end_idx = (first_page + nr_pages) / BITS_PER_LONG;
+ end_off = (first_page + nr_pages) & (BITS_PER_LONG-1);
+
+ if (curr_idx == end_idx) {
+ contiguous_bitmap[curr_idx] |=
+ ((1UL<<end_off)-1) & -(1UL<<start_off);
+ } else {
+ contiguous_bitmap[curr_idx] |= -(1UL<<start_off);
+ while ( ++curr_idx < end_idx )
+ contiguous_bitmap[curr_idx] = ~0UL;
+ contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1;
+ }
+}
+
+static void contiguous_bitmap_clear(
+ unsigned long first_page, unsigned long nr_pages)
+{
+ unsigned long start_off, end_off, curr_idx, end_idx;
+
+ curr_idx = first_page / BITS_PER_LONG;
+ start_off = first_page & (BITS_PER_LONG-1);
+ end_idx = (first_page + nr_pages) / BITS_PER_LONG;
+ end_off = (first_page + nr_pages) & (BITS_PER_LONG-1);
+
+ if (curr_idx == end_idx) {
+ contiguous_bitmap[curr_idx] &=
+ -(1UL<<end_off) | ((1UL<<start_off)-1);
+ } else {
+ contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1;
+ while ( ++curr_idx != end_idx )
+ contiguous_bitmap[curr_idx] = 0;
+ contiguous_bitmap[curr_idx] &= -(1UL<<end_off);
+ }
+}
+
+/* Protected by balloon_lock. */
+#define MAX_CONTIG_ORDER 9 /* 2MB */
+static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
+static unsigned long limited_frames[1<<MAX_CONTIG_ORDER];
+static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
+
+/* Ensure multi-page extents are contiguous in machine memory. */
+int xen_create_contiguous_region(
+ unsigned long vstart, unsigned int order, unsigned int address_bits)
+{
+ unsigned long *in_frames = discontig_frames, out_frame;
+ unsigned long frame, flags;
+ unsigned int i;
+ int rc, success;
+ struct xen_memory_exchange exchange = {
+ .in = {
+ .nr_extents = 1UL << order,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ },
+ .out = {
+ .nr_extents = 1,
+ .extent_order = order,
+ .address_bits = address_bits,
+ .domid = DOMID_SELF
+ }
+ };
+
+ /*
+ * Currently an auto-translated guest will not perform I/O, nor will
+ * it require PAE page directories below 4GB. Therefore any calls to
+ * this function are redundant and can be ignored.
+ */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return -ENOMEM;
+
+ set_xen_guest_handle(exchange.in.extent_start, in_frames);
+ set_xen_guest_handle(exchange.out.extent_start, &out_frame);
+
+ scrub_pages((void *)vstart, 1 << order);
+
+ balloon_lock(flags);
+
+ /* 1. Zap current PTEs, remembering MFNs. */
+ for (i = 0; i < (1U<<order); i++) {
+ in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i);
+ MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
+ __pte_ma(0), 0);
+ set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
+ INVALID_P2M_ENTRY);
+ }
+ if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
+ BUG();
+
+ /* 2. Get a new contiguous memory extent. */
+ out_frame = __pa(vstart) >> PAGE_SHIFT;
+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+ success = (exchange.nr_exchanged == (1UL << order));
+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
+ BUG_ON(success && (rc != 0));
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOSYS)) {
+ /* Compatibility when XENMEM_exchange is unsupported. */
+ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &exchange.in) != (1UL << order))
+ BUG();
+ success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+ &exchange.out) == 1);
+ if (!success) {
+ /* Couldn't get special memory: fall back to normal. */
+ for (i = 0; i < (1U<<order); i++)
+ in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i;
+ if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+ &exchange.in) != (1UL<<order))
+ BUG();
+ }
+ }
+#endif
+
+ /* 3. Map the new extent in place of old pages. */
+ for (i = 0; i < (1U<<order); i++) {
+ frame = success ? (out_frame + i) : in_frames[i];
+ MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
+ pfn_pte_ma(frame, PAGE_KERNEL), 0);
+ set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
+ }
+
+ cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
+ ? UVMF_TLB_FLUSH|UVMF_ALL
+ : UVMF_INVLPG|UVMF_ALL;
+ if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
+ BUG();
+
+ if (success)
+ contiguous_bitmap_set(__pa(vstart) >> PAGE_SHIFT,
+ 1UL << order);
+
+ balloon_unlock(flags);
+
+ return success ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
+
+void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
+{
+ unsigned long *out_frames = discontig_frames, in_frame;
+ unsigned long frame, flags;
+ unsigned int i;
+ int rc, success;
+ struct xen_memory_exchange exchange = {
+ .in = {
+ .nr_extents = 1,
+ .extent_order = order,
+ .domid = DOMID_SELF
+ },
+ .out = {
+ .nr_extents = 1UL << order,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ }
+ };
+
+ if (xen_feature(XENFEAT_auto_translated_physmap) ||
+ !test_bit(__pa(vstart) >> PAGE_SHIFT, contiguous_bitmap))
+ return;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return;
+
+ set_xen_guest_handle(exchange.in.extent_start, &in_frame);
+ set_xen_guest_handle(exchange.out.extent_start, out_frames);
+
+ scrub_pages((void *)vstart, 1 << order);
+
+ balloon_lock(flags);
+
+ contiguous_bitmap_clear(__pa(vstart) >> PAGE_SHIFT, 1UL << order);
+
+ /* 1. Find start MFN of contiguous extent. */
+ in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT);
+
+ /* 2. Zap current PTEs. */
+ for (i = 0; i < (1U<<order); i++) {
+ MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
+ __pte_ma(0), 0);
+ set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
+ INVALID_P2M_ENTRY);
+ out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i;
+ }
+ if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
+ BUG();
+
+ /* 3. Do the exchange for non-contiguous MFNs. */
+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+ success = (exchange.nr_exchanged == 1);
+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
+ BUG_ON(success && (rc != 0));
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOSYS)) {
+ /* Compatibility when XENMEM_exchange is unsupported. */
+ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &exchange.in) != 1)
+ BUG();
+ if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+ &exchange.out) != (1UL << order))
+ BUG();
+ success = 1;
+ }
+#endif
+
+ /* 4. Map new pages in place of old pages. */
+ for (i = 0; i < (1U<<order); i++) {
+ frame = success ? out_frames[i] : (in_frame + i);
+ MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
+ pfn_pte_ma(frame, PAGE_KERNEL), 0);
+ set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
+ }
+
+ cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
+ ? UVMF_TLB_FLUSH|UVMF_ALL
+ : UVMF_INVLPG|UVMF_ALL;
+ if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
+ BUG();
+
+ balloon_unlock(flags);
+
+ if (unlikely(!success)) {
+ /* Try hard to get the special memory back to Xen. */
+ exchange.in.extent_order = 0;
+ set_xen_guest_handle(exchange.in.extent_start, &in_frame);
+
+ for (i = 0; i < (1U<<order); i++) {
+ struct page *page = alloc_page(__GFP_HIGHMEM|__GFP_COLD);
+ unsigned long pfn;
+ mmu_update_t mmu;
+ unsigned int j = 0;
+
+ if (!page) {
+ printk(KERN_WARNING "Xen and kernel out of memory "
+ "while trying to release an order %u "
+ "contiguous region\n", order);
+ break;
+ }
+ pfn = page_to_pfn(page);
+
+ balloon_lock(flags);
+
+ if (!PageHighMem(page)) {
+ void *v = __va(pfn << PAGE_SHIFT);
+
+ scrub_pages(v, 1);
+ MULTI_update_va_mapping(cr_mcl + j, (unsigned long)v,
+ __pte_ma(0), UVMF_INVLPG|UVMF_ALL);
+ ++j;
+ }
+#ifdef CONFIG_XEN_SCRUB_PAGES
+ else {
+ scrub_pages(kmap(page), 1);
+ kunmap(page);
+ kmap_flush_unused();
+ }
+#endif
+
+ frame = pfn_to_mfn(pfn);
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+
+ MULTI_update_va_mapping(cr_mcl + j, vstart,
+ pfn_pte_ma(frame, PAGE_KERNEL),
+ UVMF_INVLPG|UVMF_ALL);
+ ++j;
+
+ pfn = __pa(vstart) >> PAGE_SHIFT;
+ set_phys_to_machine(pfn, frame);
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ mmu.ptr = ((uint64_t)frame << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+ mmu.val = pfn;
+ cr_mcl[j].op = __HYPERVISOR_mmu_update;
+ cr_mcl[j].args[0] = (unsigned long)&mmu;
+ cr_mcl[j].args[1] = 1;
+ cr_mcl[j].args[2] = 0;
+ cr_mcl[j].args[3] = DOMID_SELF;
+ ++j;
+ }
+
+ cr_mcl[j].op = __HYPERVISOR_memory_op;
+ cr_mcl[j].args[0] = XENMEM_decrease_reservation;
+ cr_mcl[j].args[1] = (unsigned long)&exchange.in;
+
+ if (HYPERVISOR_multicall(cr_mcl, j + 1))
+ BUG();
+ BUG_ON(cr_mcl[j].result != 1);
+ while (j--)
+ BUG_ON(cr_mcl[j].result != 0);
+
+ balloon_unlock(flags);
+
+ free_empty_pages(&page, 1);
+
+ in_frame++;
+ vstart += PAGE_SIZE;
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
+
+int xen_limit_pages_to_max_mfn(
+ struct page *pages, unsigned int order, unsigned int address_bits)
+{
+ unsigned long flags, frame;
+ unsigned long *in_frames = discontig_frames, *out_frames = limited_frames;
+ struct page *page;
+ unsigned int i, n, nr_mcl;
+ int rc, success;
+ DECLARE_BITMAP(limit_map, 1 << MAX_CONTIG_ORDER);
+
+ struct xen_memory_exchange exchange = {
+ .in = {
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ },
+ .out = {
+ .extent_order = 0,
+ .address_bits = address_bits,
+ .domid = DOMID_SELF
+ }
+ };
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return -ENOMEM;
+
+ bitmap_zero(limit_map, 1U << order);
+ set_xen_guest_handle(exchange.in.extent_start, in_frames);
+ set_xen_guest_handle(exchange.out.extent_start, out_frames);
+
+ /* 0. Scrub the pages. */
+ for (i = 0, n = 0; i < 1U<<order ; i++) {
+ page = &pages[i];
+ if (!(pfn_to_mfn(page_to_pfn(page)) >> (address_bits - PAGE_SHIFT)))
+ continue;
+ __set_bit(i, limit_map);
+
+ if (!PageHighMem(page))
+ scrub_pages(page_address(page), 1);
+#ifdef CONFIG_XEN_SCRUB_PAGES
+ else {
+ scrub_pages(kmap(page), 1);
+ kunmap(page);
+ ++n;
+ }
+#endif
+ }
+ if (bitmap_empty(limit_map, 1U << order))
+ return 0;
+
+ if (n)
+ kmap_flush_unused();
+
+ balloon_lock(flags);
+
+ /* 1. Zap current PTEs (if any), remembering MFNs. */
+ for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
+ if(!test_bit(i, limit_map))
+ continue;
+ page = &pages[i];
+
+ out_frames[n] = page_to_pfn(page);
+ in_frames[n] = pfn_to_mfn(out_frames[n]);
+
+ if (!PageHighMem(page))
+ MULTI_update_va_mapping(cr_mcl + nr_mcl++,
+ (unsigned long)page_address(page),
+ __pte_ma(0), 0);
+
+ set_phys_to_machine(out_frames[n], INVALID_P2M_ENTRY);
+ ++n;
+ }
+ if (nr_mcl && HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
+ BUG();
+
+ /* 2. Get new memory below the required limit. */
+ exchange.in.nr_extents = n;
+ exchange.out.nr_extents = n;
+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+ success = (exchange.nr_exchanged == n);
+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
+ BUG_ON(success && (rc != 0));
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOSYS)) {
+ /* Compatibility when XENMEM_exchange is unsupported. */
+ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &exchange.in) != n)
+ BUG();
+ if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+ &exchange.out) != n)
+ BUG();
+ success = 1;
+ }
+#endif
+
+ /* 3. Map the new pages in place of old pages. */
+ for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
+ if(!test_bit(i, limit_map))
+ continue;
+ page = &pages[i];
+
+ frame = success ? out_frames[n] : in_frames[n];
+
+ if (!PageHighMem(page))
+ MULTI_update_va_mapping(cr_mcl + nr_mcl++,
+ (unsigned long)page_address(page),
+ pfn_pte_ma(frame, PAGE_KERNEL), 0);
+
+ set_phys_to_machine(page_to_pfn(page), frame);
+ ++n;
+ }
+ if (nr_mcl) {
+ cr_mcl[nr_mcl - 1].args[MULTI_UVMFLAGS_INDEX] = order
+ ? UVMF_TLB_FLUSH|UVMF_ALL
+ : UVMF_INVLPG|UVMF_ALL;
+ if (HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
+ BUG();
+ }
+
+ balloon_unlock(flags);
+
+ return success ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);
+
+int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
+{
+ maddr_t mach_lp = arbitrary_virt_to_machine(ldt + entry);
+ return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
+}
+
+#define MAX_BATCHED_FULL_PTES 32
+
+int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end, pgprot_t newprot,
+ int dirty_accountable)
+{
+ int rc = 0, i = 0;
+ mmu_update_t u[MAX_BATCHED_FULL_PTES];
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
+ return 0;
+
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ do {
+ if (pte_present(*pte)) {
+ pte_t ptent = pte_modify(*pte, newprot);
+
+ if (dirty_accountable && pte_dirty(ptent))
+ ptent = pte_mkwrite(ptent);
+ u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
+ | ((unsigned long)pte & ~PAGE_MASK)
+ | MMU_PT_UPDATE_PRESERVE_AD;
+ u[i].val = __pte_val(ptent);
+ if (++i == MAX_BATCHED_FULL_PTES) {
+ if ((rc = HYPERVISOR_mmu_update(
+ &u[0], i, NULL, DOMID_SELF)) != 0)
+ break;
+ i = 0;
+ }
+ }
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ if (i)
+ rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
+ pte_unmap_unlock(pte - 1, ptl);
+ BUG_ON(rc && rc != -ENOSYS);
+ return !rc;
+}
--- /dev/null
+/*
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ */
+
+#include <linux/module.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/pci.h>
+#include <linux/pfn.h>
+#include <linux/poison.h>
+#include <linux/bootmem.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/memory_hotplug.h>
+#include <linux/initrd.h>
+#include <linux/cpumask.h>
+#include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
+
+#include <asm/asm.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/bugs.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+#include <asm/sections.h>
+#include <asm/hypervisor.h>
+#include <asm/swiotlb.h>
+#include <asm/setup.h>
+#include <asm/cacheflush.h>
+
+extern unsigned long *contiguous_bitmap;
+
+unsigned int __VMALLOC_RESERVE = 128 << 20;
+
+unsigned long max_pfn_mapped;
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+unsigned long highstart_pfn, highend_pfn;
+
+static noinline int do_test_wp_bit(void);
+
+/*
+ * Creates a middle page table and puts a pointer to it in the
+ * given global directory entry. This only returns the gd entry
+ * in non-PAE compilation mode, since the middle layer is folded.
+ */
+static pmd_t * __init one_md_table_init(pgd_t *pgd)
+{
+ pud_t *pud;
+ pmd_t *pmd_table;
+
+#ifdef CONFIG_X86_PAE
+ if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
+ pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
+ make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
+ set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+ pud = pud_offset(pgd, 0);
+ BUG_ON(pmd_table != pmd_offset(pud, 0));
+ }
+#endif
+ pud = pud_offset(pgd, 0);
+ pmd_table = pmd_offset(pud, 0);
+
+ return pmd_table;
+}
+
+/*
+ * Create a page table and place a pointer to it in a middle page
+ * directory entry:
+ */
+static pte_t * __init one_page_table_init(pmd_t *pmd)
+{
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (pmd_none(*pmd)) {
+#else
+ if (!(__pmd_val(*pmd) & _PAGE_PRESENT)) {
+#endif
+ pte_t *page_table = NULL;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
+#endif
+ if (!page_table) {
+ page_table =
+ (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ }
+
+ paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
+ make_lowmem_page_readonly(page_table,
+ XENFEAT_writable_page_tables);
+ set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
+ BUG_ON(page_table != pte_offset_kernel(pmd, 0));
+ }
+
+ return pte_offset_kernel(pmd, 0);
+}
+
+/*
+ * This function initializes a certain range of kernel virtual memory
+ * with new bootmem page tables, everywhere page tables are missing in
+ * the given range.
+ *
+ * NOTE: The pagetables are allocated contiguous on the physical space
+ * so we can cache the place of the first one and move around without
+ * checking the pgd every time.
+ */
+static void __init
+page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
+{
+ int pgd_idx, pmd_idx;
+ unsigned long vaddr;
+ pgd_t *pgd;
+ pmd_t *pmd;
+
+ vaddr = start;
+ pgd_idx = pgd_index(vaddr);
+ pmd_idx = pmd_index(vaddr);
+ pgd = pgd_base + pgd_idx;
+
+ for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
+ pmd = one_md_table_init(pgd);
+ pmd = pmd + pmd_index(vaddr);
+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+ pmd++, pmd_idx++) {
+ if (vaddr < hypervisor_virt_start)
+ one_page_table_init(pmd);
+
+ vaddr += PMD_SIZE;
+ }
+ pmd_idx = 0;
+ }
+}
+
+static inline int is_kernel_text(unsigned long addr)
+{
+ if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+ return 1;
+ return 0;
+}
+
+/*
+ * This maps the physical memory to kernel virtual address space, a total
+ * of max_low_pfn pages, by creating page tables starting from address
+ * PAGE_OFFSET:
+ */
+static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
+{
+ int pgd_idx, pmd_idx, pte_ofs;
+ unsigned long pfn;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ unsigned long max_ram_pfn = xen_start_info->nr_pages;
+ if (max_ram_pfn > max_low_pfn)
+ max_ram_pfn = max_low_pfn;
+
+ pgd_idx = pgd_index(PAGE_OFFSET);
+ pgd = pgd_base + pgd_idx;
+ pfn = 0;
+ pmd_idx = pmd_index(PAGE_OFFSET);
+ pte_ofs = pte_index(PAGE_OFFSET);
+
+ for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
+#ifdef CONFIG_XEN
+ /*
+ * Native linux hasn't PAE-paging enabled yet at this
+ * point. When running as xen domain we are in PAE
+ * mode already, thus we can't simply hook a empty
+ * pmd. That would kill the mappings we are currently
+ * using ...
+ */
+ pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
+#else
+ pmd = one_md_table_init(pgd);
+#endif
+ if (pfn >= max_low_pfn)
+ continue;
+ pmd += pmd_idx;
+ for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
+ pmd++, pmd_idx++) {
+ unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
+
+ if (addr >= hypervisor_virt_start)
+ continue;
+
+ /*
+ * Map with big pages if possible, otherwise
+ * create normal page tables:
+ *
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
+ */
+ if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
+ unsigned int addr2;
+ pgprot_t prot = PAGE_KERNEL_LARGE;
+
+ addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
+ PAGE_OFFSET + PAGE_SIZE-1;
+
+ if (is_kernel_text(addr) ||
+ is_kernel_text(addr2))
+ prot = PAGE_KERNEL_LARGE_EXEC;
+
+ set_pmd(pmd, pfn_pmd(pfn, prot));
+
+ pfn += PTRS_PER_PTE;
+ max_pfn_mapped = pfn;
+ continue;
+ }
+ pte = one_page_table_init(pmd);
+
+ for (pte += pte_ofs;
+ pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+ pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
+ pgprot_t prot = PAGE_KERNEL;
+
+ /* XEN: Only map initial RAM allocation. */
+ if ((pfn >= max_ram_pfn) || pte_present(*pte))
+ continue;
+ if (is_kernel_text(addr))
+ prot = PAGE_KERNEL_EXEC;
+
+ set_pte(pte, pfn_pte(pfn, prot));
+ }
+ max_pfn_mapped = pfn;
+ pte_ofs = 0;
+ }
+ pmd_idx = 0;
+ }
+}
+
+#ifndef CONFIG_XEN
+
+static inline int page_kills_ppro(unsigned long pagenr)
+{
+ if (pagenr >= 0x70000 && pagenr <= 0x7003F)
+ return 1;
+ return 0;
+}
+
+#else
+
+#define page_kills_ppro(p) 0
+
+#endif
+
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (pagenr <= 256)
+ return 1;
+ if (mfn_to_local_pfn(pagenr) >= max_pfn)
+ return 1;
+ return 0;
+}
+
+#ifdef CONFIG_HIGHMEM
+pte_t *kmap_pte;
+pgprot_t kmap_prot;
+
+static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
+{
+ return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
+ vaddr), vaddr), vaddr);
+}
+
+static void __init kmap_init(void)
+{
+ unsigned long kmap_vstart;
+
+ /*
+ * Cache the first kmap pte:
+ */
+ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
+ kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
+
+ kmap_prot = PAGE_KERNEL;
+}
+
+static void __init permanent_kmaps_init(pgd_t *pgd_base)
+{
+ unsigned long vaddr;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ vaddr = PKMAP_BASE;
+ page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
+
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ pud = pud_offset(pgd, vaddr);
+ pmd = pmd_offset(pud, vaddr);
+ pte = pte_offset_kernel(pmd, vaddr);
+ pkmap_page_table = pte;
+}
+
+void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
+{
+ if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
+ ClearPageReserved(page);
+ init_page_count(page);
+ if (pfn < xen_start_info->nr_pages)
+ __free_page(page);
+ totalhigh_pages++;
+ } else
+ SetPageReserved(page);
+}
+
+#ifndef CONFIG_NUMA
+static void __init set_highmem_pages_init(int bad_ppro)
+{
+ int pfn;
+
+ for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
+ /*
+ * Holes under sparsemem might not have no mem_map[]:
+ */
+ if (pfn_valid(pfn))
+ add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+ }
+ totalram_pages += totalhigh_pages;
+}
+#endif /* !CONFIG_NUMA */
+
+#else
+# define kmap_init() do { } while (0)
+# define permanent_kmaps_init(pgd_base) do { } while (0)
+# define set_highmem_pages_init(bad_ppro) do { } while (0)
+#endif /* CONFIG_HIGHMEM */
+
+pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
+EXPORT_SYMBOL(__PAGE_KERNEL);
+
+pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
+
+pgd_t *swapper_pg_dir;
+
+static void __init xen_pagetable_setup_start(pgd_t *base)
+{
+}
+
+static void __init xen_pagetable_setup_done(pgd_t *base)
+{
+}
+
+/*
+ * Build a proper pagetable for the kernel mappings. Up until this
+ * point, we've been running on some set of pagetables constructed by
+ * the boot process.
+ *
+ * If we're booting on native hardware, this will be a pagetable
+ * constructed in arch/x86/kernel/head_32.S. The root of the
+ * pagetable will be swapper_pg_dir.
+ *
+ * If we're booting paravirtualized under a hypervisor, then there are
+ * more options: we may already be running PAE, and the pagetable may
+ * or may not be based in swapper_pg_dir. In any case,
+ * paravirt_pagetable_setup_start() will set up swapper_pg_dir
+ * appropriately for the rest of the initialization to work.
+ *
+ * In general, pagetable_init() assumes that the pagetable may already
+ * be partially populated, and so it avoids stomping on any existing
+ * mappings.
+ */
+static void __init pagetable_init(void)
+{
+ pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
+ unsigned long vaddr, end;
+
+ xen_pagetable_setup_start(pgd_base);
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __PAGE_KERNEL |= _PAGE_GLOBAL;
+ __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
+ }
+
+ kernel_physical_mapping_init(pgd_base);
+ remap_numa_kva();
+
+ /*
+ * Fixed mappings, only the page table structure has to be
+ * created - mappings will be set by set_fixmap():
+ */
+ early_ioremap_clear();
+ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
+ end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
+ page_table_range_init(vaddr, end, pgd_base);
+ early_ioremap_reset();
+
+ permanent_kmaps_init(pgd_base);
+
+ xen_pagetable_setup_done(pgd_base);
+}
+
+#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
+/*
+ * ACPI suspend needs this for resume, because things like the intel-agp
+ * driver might have split up a kernel 4MB mapping.
+ */
+char swsusp_pg_dir[PAGE_SIZE]
+ __attribute__ ((aligned(PAGE_SIZE)));
+
+static inline void save_pg_dir(void)
+{
+ memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
+}
+#else /* !CONFIG_ACPI_SLEEP */
+static inline void save_pg_dir(void)
+{
+}
+#endif /* !CONFIG_ACPI_SLEEP */
+
+void zap_low_mappings(void)
+{
+ int i;
+
+ /*
+ * Zap initial low-memory mappings.
+ *
+ * Note that "pgd_clear()" doesn't do it for
+ * us, because pgd_clear() is a no-op on i386.
+ */
+ for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
+#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
+ set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
+#else
+ set_pgd(swapper_pg_dir+i, __pgd(0));
+#endif
+ }
+ flush_tlb_all();
+}
+
+int nx_enabled;
+
+pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+#ifdef CONFIG_X86_PAE
+
+static int disable_nx __initdata;
+
+/*
+ * noexec = on|off
+ *
+ * Control non executable mappings.
+ *
+ * on Enable
+ * off Disable
+ */
+static int __init noexec_setup(char *str)
+{
+ if (!str || !strcmp(str, "on")) {
+ if (cpu_has_nx) {
+ __supported_pte_mask |= _PAGE_NX;
+ disable_nx = 0;
+ }
+ } else {
+ if (!strcmp(str, "off")) {
+ disable_nx = 1;
+ __supported_pte_mask &= ~_PAGE_NX;
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+early_param("noexec", noexec_setup);
+
+static void __init set_nx(void)
+{
+ unsigned int v[4], l, h;
+
+ if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
+ cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+
+ if ((v[3] & (1 << 20)) && !disable_nx) {
+ rdmsr(MSR_EFER, l, h);
+ l |= EFER_NX;
+ wrmsr(MSR_EFER, l, h);
+ nx_enabled = 1;
+ __supported_pte_mask |= _PAGE_NX;
+ }
+ }
+}
+#endif
+
+/*
+ * paging_init() sets up the page tables - note that the first 8MB are
+ * already mapped by head.S.
+ *
+ * This routines also unmaps the page at virtual kernel address 0, so
+ * that we can trap those pesky NULL-reference errors in the kernel.
+ */
+void __init paging_init(void)
+{
+ int i;
+
+#ifdef CONFIG_X86_PAE
+ set_nx();
+ if (nx_enabled)
+ printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#endif
+ pagetable_init();
+
+ __flush_tlb_all();
+
+ kmap_init();
+
+ /* Switch to the real shared_info page, and clear the
+ * dummy page. */
+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+ memset(empty_zero_page, 0, sizeof(empty_zero_page));
+
+ /* Setup mapping of lower 1st MB */
+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
+ if (is_initial_xendomain())
+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
+ else
+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
+ virt_to_machine(empty_zero_page),
+ PAGE_KERNEL_RO);
+}
+
+/*
+ * Test if the WP bit works in supervisor mode. It isn't supported on 386's
+ * and also on some strange 486's. All 586+'s are OK. This used to involve
+ * black magic jumps to work around some nasty CPU bugs, but fortunately the
+ * switch to using exceptions got rid of all that.
+ */
+static void __init test_wp_bit(void)
+{
+ printk(KERN_INFO
+ "Checking if this processor honours the WP bit even in supervisor mode...");
+
+ /* Any page-aligned address will do, the test is non-destructive */
+ __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
+ boot_cpu_data.wp_works_ok = do_test_wp_bit();
+ clear_fixmap(FIX_WP_TEST);
+
+ if (!boot_cpu_data.wp_works_ok) {
+ printk(KERN_CONT "No.\n");
+#ifdef CONFIG_X86_WP_WORKS_OK
+ panic(
+ "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
+#endif
+ } else {
+ printk(KERN_CONT "Ok.\n");
+ }
+}
+
+static struct kcore_list kcore_mem, kcore_vmalloc;
+
+void __init mem_init(void)
+{
+ int codesize, reservedpages, datasize, initsize;
+ int tmp, bad_ppro;
+ unsigned long pfn;
+
+ contiguous_bitmap = alloc_bootmem_low_pages(
+ (max_low_pfn + 2*BITS_PER_LONG) >> 3);
+ BUG_ON(!contiguous_bitmap);
+ memset(contiguous_bitmap, 0, (max_low_pfn + 2*BITS_PER_LONG) >> 3);
+
+ pci_iommu_alloc();
+
+#ifdef CONFIG_FLATMEM
+ BUG_ON(!mem_map);
+#endif
+ bad_ppro = ppro_with_ram_bug();
+
+#ifdef CONFIG_HIGHMEM
+ /* check that fixmap and pkmap do not overlap */
+ if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
+ printk(KERN_ERR
+ "fixmap and kmap areas overlap - this will crash\n");
+ printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
+ PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
+ FIXADDR_START);
+ BUG();
+ }
+#endif
+ /* this will put all low memory onto the freelists */
+ totalram_pages += free_all_bootmem();
+ /* XEN: init and count low-mem pages outside initial allocation. */
+ for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) {
+ ClearPageReserved(pfn_to_page(pfn));
+ init_page_count(pfn_to_page(pfn));
+ totalram_pages++;
+ }
+
+ reservedpages = 0;
+ for (tmp = 0; tmp < max_low_pfn; tmp++)
+ /*
+ * Only count reserved RAM pages:
+ */
+ if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
+ reservedpages++;
+
+ set_highmem_pages_init(bad_ppro);
+
+ codesize = (unsigned long) &_etext - (unsigned long) &_text;
+ datasize = (unsigned long) &_edata - (unsigned long) &_etext;
+ initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
+
+ kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
+ VMALLOC_END-VMALLOC_START);
+
+ printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
+ "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
+ (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+ num_physpages << (PAGE_SHIFT-10),
+ codesize >> 10,
+ reservedpages << (PAGE_SHIFT-10),
+ datasize >> 10,
+ initsize >> 10,
+ (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
+ );
+
+#if 1 /* double-sanity-check paranoia */
+ printk(KERN_INFO "virtual kernel memory layout:\n"
+ " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+#ifdef CONFIG_HIGHMEM
+ " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+#endif
+ " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
+ " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
+ " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
+ FIXADDR_START, FIXADDR_TOP,
+ (FIXADDR_TOP - FIXADDR_START) >> 10,
+
+#ifdef CONFIG_HIGHMEM
+ PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+ (LAST_PKMAP*PAGE_SIZE) >> 10,
+#endif
+
+ VMALLOC_START, VMALLOC_END,
+ (VMALLOC_END - VMALLOC_START) >> 20,
+
+ (unsigned long)__va(0), (unsigned long)high_memory,
+ ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+
+ (unsigned long)&__init_begin, (unsigned long)&__init_end,
+ ((unsigned long)&__init_end -
+ (unsigned long)&__init_begin) >> 10,
+
+ (unsigned long)&_etext, (unsigned long)&_edata,
+ ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+
+ (unsigned long)&_text, (unsigned long)&_etext,
+ ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+
+#ifdef CONFIG_HIGHMEM
+ BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+ BUG_ON(VMALLOC_END > PKMAP_BASE);
+#endif
+ BUG_ON(VMALLOC_START > VMALLOC_END);
+ BUG_ON((unsigned long)high_memory > VMALLOC_START);
+#endif /* double-sanity-check paranoia */
+
+ if (boot_cpu_data.wp_works_ok < 0)
+ test_wp_bit();
+
+ cpa_init();
+ save_pg_dir();
+ zap_low_mappings();
+
+ SetPagePinned(virt_to_page(init_mm.pgd));
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int arch_add_memory(int nid, u64 start, u64 size)
+{
+ struct pglist_data *pgdata = NODE_DATA(nid);
+ struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ return __add_pages(zone, start_pfn, nr_pages);
+}
+#endif
+
+/*
+ * This function cannot be __init, since exceptions don't work in that
+ * section. Put this after the callers, so that it cannot be inlined.
+ */
+static noinline int do_test_wp_bit(void)
+{
+ char tmp_reg;
+ int flag;
+
+ __asm__ __volatile__(
+ " movb %0, %1 \n"
+ "1: movb %1, %0 \n"
+ " xorl %2, %2 \n"
+ "2: \n"
+ _ASM_EXTABLE(1b,2b)
+ :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
+ "=q" (tmp_reg),
+ "=r" (flag)
+ :"2" (1)
+ :"memory");
+
+ return flag;
+}
+
+#ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
+
+void mark_rodata_ro(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long size = PFN_ALIGN(_etext) - start;
+
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+ printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+ size >> 10);
+
+#ifdef CONFIG_CPA_DEBUG
+ printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
+ start, start+size);
+ set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
+
+ printk(KERN_INFO "Testing CPA: write protecting again\n");
+ set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
+#endif
+ start += size;
+ size = (unsigned long)__end_rodata - start;
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+ printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+ size >> 10);
+ rodata_test();
+
+#ifdef CONFIG_CPA_DEBUG
+ printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
+ set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+
+ printk(KERN_INFO "Testing CPA: write protecting again\n");
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+#endif
+}
+#endif
+
+void free_init_pages(char *what, unsigned long begin, unsigned long end)
+{
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * If debugging page accesses then do not free this memory but
+ * mark them not present - any buggy init-section access will
+ * create a kernel page fault:
+ */
+ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+ begin, PAGE_ALIGN(end));
+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
+ unsigned long addr;
+
+ /*
+ * We just marked the kernel text read only above, now that
+ * we are going to free part of that, we need to make that
+ * writeable first.
+ */
+ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+
+ for (addr = begin; addr < end; addr += PAGE_SIZE) {
+ ClearPageReserved(virt_to_page(addr));
+ init_page_count(virt_to_page(addr));
+ memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
+ free_page(addr);
+ totalram_pages++;
+ }
+ printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+#endif
+}
+
+void free_initmem(void)
+{
+ free_init_pages("unused kernel memory",
+ (unsigned long)(&__init_begin),
+ (unsigned long)(&__init_end));
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+ free_init_pages("initrd memory", start, end);
+}
+#endif
--- /dev/null
+/*
+ * linux/arch/x86_64/mm/init.c
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
+ *
+ * Jun Nakajima <jun.nakajima@intel.com>
+ * Modified for Xen.
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/proc_fs.h>
+#include <linux/pci.h>
+#include <linux/pfn.h>
+#include <linux/poison.h>
+#include <linux/dma-mapping.h>
+#include <linux/module.h>
+#include <linux/memory_hotplug.h>
+#include <linux/nmi.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/tlb.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+#include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/numa.h>
+#include <asm/cacheflush.h>
+
+#include <xen/features.h>
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+unsigned int __kernel_page_user;
+EXPORT_SYMBOL(__kernel_page_user);
+#endif
+
+int after_bootmem;
+
+extern unsigned long *contiguous_bitmap;
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+extern unsigned long start_pfn;
+
+extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
+extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
+
+int direct_gbpages __meminitdata
+#ifdef CONFIG_DIRECT_GBPAGES
+ = 1
+#endif
+;
+
+#ifndef CONFIG_XEN
+static int __init parse_direct_gbpages_off(char *arg)
+{
+ direct_gbpages = 0;
+ return 0;
+}
+early_param("nogbpages", parse_direct_gbpages_off);
+
+static int __init parse_direct_gbpages_on(char *arg)
+{
+ direct_gbpages = 1;
+ return 0;
+}
+early_param("gbpages", parse_direct_gbpages_on);
+#endif
+
+/*
+ * Use this until direct mapping is established, i.e. before __va() is
+ * available in init_memory_mapping().
+ */
+
+#define addr_to_page(addr, page) \
+ (addr) &= PHYSICAL_PAGE_MASK; \
+ (page) = ((unsigned long *) ((unsigned long) \
+ (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
+ __START_KERNEL_map)))
+
+pmd_t *__init early_get_pmd(unsigned long va)
+{
+ unsigned long addr;
+ unsigned long *page = (unsigned long *)init_level4_pgt;
+
+ addr = page[pgd_index(va)];
+ addr_to_page(addr, page);
+
+ addr = page[pud_index(va)];
+ addr_to_page(addr, page);
+
+ return (pmd_t *)&page[pmd_index(va)];
+}
+
+void __meminit early_make_page_readonly(void *va, unsigned int feature)
+{
+ unsigned long addr, _va = (unsigned long)va;
+ pte_t pte, *ptep;
+ unsigned long *page = (unsigned long *) init_level4_pgt;
+
+ BUG_ON(after_bootmem);
+
+ if (xen_feature(feature))
+ return;
+
+ addr = (unsigned long) page[pgd_index(_va)];
+ addr_to_page(addr, page);
+
+ addr = page[pud_index(_va)];
+ addr_to_page(addr, page);
+
+ addr = page[pmd_index(_va)];
+ addr_to_page(addr, page);
+
+ ptep = (pte_t *) &page[pte_index(_va)];
+
+ pte.pte = ptep->pte & ~_PAGE_RW;
+ if (HYPERVISOR_update_va_mapping(_va, pte, 0))
+ BUG();
+}
+
+/*
+ * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
+ * physical space so we can cache the place of the first one and move
+ * around without checking the pgd every time.
+ */
+
+void show_mem(void)
+{
+ long i, total = 0, reserved = 0;
+ long shared = 0, cached = 0;
+ struct page *page;
+ pg_data_t *pgdat;
+
+ printk(KERN_INFO "Mem-info:\n");
+ show_free_areas();
+ for_each_online_pgdat(pgdat) {
+ for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+ /*
+ * This loop can take a while with 256 GB and
+ * 4k pages so defer the NMI watchdog:
+ */
+ if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
+ touch_nmi_watchdog();
+
+ if (!pfn_valid(pgdat->node_start_pfn + i))
+ continue;
+
+ page = pfn_to_page(pgdat->node_start_pfn + i);
+ total++;
+ if (PageReserved(page))
+ reserved++;
+ else if (PageSwapCache(page))
+ cached++;
+ else if (page_count(page))
+ shared += page_count(page) - 1;
+ }
+ }
+ printk(KERN_INFO "%lu pages of RAM\n", total);
+ printk(KERN_INFO "%lu reserved pages\n", reserved);
+ printk(KERN_INFO "%lu pages shared\n", shared);
+ printk(KERN_INFO "%lu pages swap cached\n", cached);
+}
+
+static unsigned long __meminitdata table_start;
+static unsigned long __meminitdata table_end;
+
+static __init void *spp_getpage(void)
+{
+ void *ptr;
+
+ if (after_bootmem)
+ ptr = (void *) get_zeroed_page(GFP_ATOMIC);
+ else if (start_pfn < table_end) {
+ ptr = __va(start_pfn << PAGE_SHIFT);
+ start_pfn++;
+ memset(ptr, 0, PAGE_SIZE);
+ } else
+ ptr = alloc_bootmem_pages(PAGE_SIZE);
+
+ if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
+ panic("set_pte_phys: cannot allocate page data %s\n",
+ after_bootmem ? "after bootmem" : "");
+ }
+
+ pr_debug("spp_getpage %p\n", ptr);
+
+ return ptr;
+}
+
+#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
+#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
+
+static __init void
+set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte, new_pte;
+
+ pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
+
+ pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
+ if (pgd_none(*pgd)) {
+ printk(KERN_ERR
+ "PGD FIXMAP MISSING, it should be setup in head.S!\n");
+ }
+ pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
+ if (pud_none(*pud)) {
+ pmd = (pmd_t *) spp_getpage();
+ make_page_readonly(pmd, XENFEAT_writable_page_tables);
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
+ if (pmd != pmd_offset(pud, 0)) {
+ printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+ pmd, pmd_offset(pud, 0));
+ return;
+ }
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ pte = (pte_t *) spp_getpage();
+ make_page_readonly(pte, XENFEAT_writable_page_tables);
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
+ if (pte != pte_offset_kernel(pmd, 0)) {
+ printk(KERN_ERR "PAGETABLE BUG #02!\n");
+ return;
+ }
+ }
+ if (pgprot_val(prot))
+ new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
+ else
+ new_pte = __pte(0);
+
+ pte = pte_offset_kernel(pmd, vaddr);
+ if (!pte_none(*pte) && __pte_val(new_pte) &&
+ __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
+ pte_ERROR(*pte);
+ set_pte(pte, new_pte);
+
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
+
+static __init void
+set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte, new_pte;
+
+ pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
+
+ pgd = pgd_offset_k(vaddr);
+ if (pgd_none(*pgd)) {
+ printk(KERN_ERR
+ "PGD FIXMAP MISSING, it should be setup in head.S!\n");
+ }
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ pmd = (pmd_t *) spp_getpage();
+ make_page_readonly(pmd, XENFEAT_writable_page_tables);
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
+ if (pmd != pmd_offset(pud, 0)) {
+ printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+ pmd, pmd_offset(pud, 0));
+ }
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ pte = (pte_t *) spp_getpage();
+ make_page_readonly(pte, XENFEAT_writable_page_tables);
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
+ if (pte != pte_offset_kernel(pmd, 0)) {
+ printk(KERN_ERR "PAGETABLE BUG #02!\n");
+ return;
+ }
+ }
+ new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
+
+ pte = pte_offset_kernel(pmd, vaddr);
+ if (!pte_none(*pte) && __pte_val(new_pte) &&
+ /* __acpi_map_table() fails to properly call clear_fixmap() */
+ (vaddr < __fix_to_virt(FIX_ACPI_END) ||
+ vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
+ __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
+ pte_ERROR(*pte);
+ set_pte(pte, new_pte);
+
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * The head.S code sets up the kernel high mapping:
+ *
+ * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
+ *
+ * phys_addr holds the negative offset to the kernel, which is added
+ * to the compile time generated pmds. This results in invalid pmds up
+ * to the point where we hit the physaddr 0 mapping.
+ *
+ * We limit the mappings to the region from _text to _end. _end is
+ * rounded up to the 2MB boundary. This catches the invalid pmds as
+ * well, as they are located before _text:
+ */
+void __init cleanup_highmap(void)
+{
+ unsigned long vaddr = __START_KERNEL_map;
+ unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
+ pmd_t *pmd = level2_kernel_pgt;
+ pmd_t *last_pmd = pmd + PTRS_PER_PMD;
+
+ for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
+ if (pmd_none(*pmd))
+ continue;
+ if (vaddr < (unsigned long) _text || vaddr > end)
+ set_pmd(pmd, __pmd(0));
+ }
+}
+#endif
+
+/* NOTE: this is meant to be run only at boot */
+void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+{
+ unsigned long address = __fix_to_virt(idx);
+
+ if (idx >= __end_of_fixed_addresses) {
+ printk(KERN_ERR "Invalid __set_fixmap\n");
+ return;
+ }
+ switch (idx) {
+ case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+ set_pte_phys(address, phys, prot, 0);
+ set_pte_phys(address, phys, prot, 1);
+ break;
+ case FIX_EARLYCON_MEM_BASE:
+ xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
+ pfn_pte_ma(phys >> PAGE_SHIFT, prot));
+ break;
+ default:
+ set_pte_phys_ma(address, phys, prot);
+ break;
+ }
+}
+
+static __meminit void *alloc_static_page(unsigned long *phys)
+{
+ unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
+
+ if (after_bootmem) {
+ void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
+
+ *phys = __pa(adr);
+
+ return adr;
+ }
+
+ *phys = start_pfn << PAGE_SHIFT;
+ start_pfn++;
+ memset((void *)va, 0, PAGE_SIZE);
+ return (void *)va;
+}
+
+#define PTE_SIZE PAGE_SIZE
+
+static inline int __meminit make_readonly(unsigned long paddr)
+{
+ extern char __vsyscall_0;
+ int readonly = 0;
+
+ /* Make new page tables read-only. */
+ if (!xen_feature(XENFEAT_writable_page_tables)
+ && (paddr >= (table_start << PAGE_SHIFT))
+ && (paddr < (table_end << PAGE_SHIFT)))
+ readonly = 1;
+ /* Make old page tables read-only. */
+ if (!xen_feature(XENFEAT_writable_page_tables)
+ && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
+ && (paddr < (start_pfn << PAGE_SHIFT)))
+ readonly = 1;
+
+ /*
+ * No need for writable mapping of kernel image. This also ensures that
+ * page and descriptor tables embedded inside don't have writable
+ * mappings. Exclude the vsyscall area here, allowing alternative
+ * instruction patching to work.
+ */
+ if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end))
+ && !(paddr >= __pa_symbol(&__vsyscall_0)
+ && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE))
+ readonly = 1;
+
+ return readonly;
+}
+
+#ifndef CONFIG_XEN
+/* Must run before zap_low_mappings */
+__meminit void *early_ioremap(unsigned long addr, unsigned long size)
+{
+ pmd_t *pmd, *last_pmd;
+ unsigned long vaddr;
+ int i, pmds;
+
+ pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
+ vaddr = __START_KERNEL_map;
+ pmd = level2_kernel_pgt;
+ last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
+
+ for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
+ for (i = 0; i < pmds; i++) {
+ if (pmd_present(pmd[i]))
+ goto continue_outer_loop;
+ }
+ vaddr += addr & ~PMD_MASK;
+ addr &= PMD_MASK;
+
+ for (i = 0; i < pmds; i++, addr += PMD_SIZE)
+ set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+ __flush_tlb_all();
+
+ return (void *)vaddr;
+continue_outer_loop:
+ ;
+ }
+ printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
+ return NULL;
+}
+
+/*
+ * To avoid virtual aliases later:
+ */
+__meminit void early_iounmap(void *addr, unsigned long size)
+{
+ unsigned long vaddr;
+ pmd_t *pmd;
+ int i, pmds;
+
+ vaddr = (unsigned long)addr;
+ pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
+ pmd = level2_kernel_pgt + pmd_index(vaddr);
+
+ for (i = 0; i < pmds; i++)
+ pmd_clear(pmd + i);
+
+ __flush_tlb_all();
+}
+#endif
+
+static unsigned long __meminit
+phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
+{
+ int i = pmd_index(address);
+
+ for (; i < PTRS_PER_PMD; i++) {
+ unsigned long pte_phys;
+ pmd_t *pmd = pmd_page + i;
+ pte_t *pte, *pte_save;
+ int k;
+
+ if (address >= end)
+ break;
+
+ if (__pmd_val(*pmd)) {
+ address += PMD_SIZE;
+ continue;
+ }
+
+ pte = alloc_static_page(&pte_phys);
+ pte_save = pte;
+ for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
+ unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
+
+ if (address >= (after_bootmem
+ ? end
+ : xen_start_info->nr_pages << PAGE_SHIFT))
+ pteval = 0;
+ else if (make_readonly(address))
+ pteval &= ~_PAGE_RW;
+ set_pte(pte, __pte(pteval & __supported_pte_mask));
+ }
+ if (!after_bootmem) {
+ early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
+ *pmd = __pmd(pte_phys | _KERNPG_TABLE);
+ } else {
+ make_page_readonly(pte_save, XENFEAT_writable_page_tables);
+ set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
+ }
+ }
+ return address;
+}
+
+static unsigned long __meminit
+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+{
+ pmd_t *pmd = pmd_offset(pud, 0);
+ unsigned long last_map_addr;
+
+ spin_lock(&init_mm.page_table_lock);
+ last_map_addr = phys_pmd_init(pmd, address, end);
+ spin_unlock(&init_mm.page_table_lock);
+ __flush_tlb_all();
+ return last_map_addr;
+}
+
+static unsigned long __meminit
+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
+{
+ unsigned long last_map_addr = end;
+ int i = pud_index(addr);
+
+ for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
+ unsigned long pmd_phys;
+ pud_t *pud = pud_page + pud_index(addr);
+ pmd_t *pmd;
+
+ if (addr >= end)
+ break;
+
+ if (__pud_val(*pud)) {
+ if (!pud_large(*pud))
+ last_map_addr = phys_pmd_update(pud, addr, end);
+ continue;
+ }
+
+ if (direct_gbpages) {
+ set_pte((pte_t *)pud,
+ pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
+ continue;
+ }
+
+ pmd = alloc_static_page(&pmd_phys);
+
+ spin_lock(&init_mm.page_table_lock);
+ *pud = __pud(pmd_phys | _KERNPG_TABLE);
+ last_map_addr = phys_pmd_init(pmd, addr, end);
+ spin_unlock(&init_mm.page_table_lock);
+
+ early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
+ }
+ __flush_tlb_all();
+
+ return last_map_addr >> PAGE_SHIFT;
+}
+
+void __init xen_init_pt(void)
+{
+ unsigned long addr, *page;
+
+ /* Find the initial pte page that was built for us. */
+ page = (unsigned long *)xen_start_info->pt_base;
+ addr = page[pgd_index(__START_KERNEL_map)];
+ addr_to_page(addr, page);
+ addr = page[pud_index(__START_KERNEL_map)];
+ addr_to_page(addr, page);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
+ in kernel PTEs. We check that here. */
+ if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
+ unsigned long *pg;
+ pte_t pte;
+
+ /* Mess with the initial mapping of page 0. It's not needed. */
+ BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
+ addr = page[pmd_index(__START_KERNEL_map)];
+ addr_to_page(addr, pg);
+ pte.pte = pg[pte_index(__START_KERNEL_map)];
+ BUG_ON(!(pte.pte & _PAGE_PRESENT));
+
+ /* If _PAGE_USER isn't set, we obviously do not need it. */
+ if (pte.pte & _PAGE_USER) {
+ /* _PAGE_USER is needed, but is it set implicitly? */
+ pte.pte &= ~_PAGE_USER;
+ if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
+ pte, 0) != 0) ||
+ !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
+ /* We need to explicitly specify _PAGE_USER. */
+ __kernel_page_user = _PAGE_USER;
+ }
+ }
+#endif
+
+ /* Construct mapping of initial pte page in our own directories. */
+ init_level4_pgt[pgd_index(__START_KERNEL_map)] =
+ __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE);
+ level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
+ __pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE);
+ memcpy(level2_kernel_pgt, page, PAGE_SIZE);
+
+ __user_pgd(init_level4_pgt)[pgd_index(VSYSCALL_START)] =
+ __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
+
+ /* Do an early initialization of the fixmap area. */
+ addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
+ level3_kernel_pgt[pud_index(addr)] =
+ __pud(__pa_symbol(level2_fixmap_pgt) | _PAGE_TABLE);
+ level2_fixmap_pgt[pmd_index(addr)] =
+ __pmd(__pa_symbol(level1_fixmap_pgt) | _PAGE_TABLE);
+
+ early_make_page_readonly(init_level4_pgt,
+ XENFEAT_writable_page_tables);
+ early_make_page_readonly(__user_pgd(init_level4_pgt),
+ XENFEAT_writable_page_tables);
+ early_make_page_readonly(level3_kernel_pgt,
+ XENFEAT_writable_page_tables);
+ early_make_page_readonly(level3_user_pgt,
+ XENFEAT_writable_page_tables);
+ early_make_page_readonly(level2_kernel_pgt,
+ XENFEAT_writable_page_tables);
+ early_make_page_readonly(level2_fixmap_pgt,
+ XENFEAT_writable_page_tables);
+ early_make_page_readonly(level1_fixmap_pgt,
+ XENFEAT_writable_page_tables);
+
+ if (!xen_feature(XENFEAT_writable_page_tables)) {
+ xen_pgd_pin(__pa_symbol(init_level4_pgt));
+ xen_pgd_pin(__pa_symbol(__user_pgd(init_level4_pgt)));
+ }
+}
+
+static void __init extend_init_mapping(unsigned long tables_space)
+{
+ unsigned long va = __START_KERNEL_map;
+ unsigned long start = start_pfn;
+ unsigned long phys, addr, *pte_page;
+ pmd_t *pmd;
+ pte_t *pte, new_pte;
+ unsigned long *page = (unsigned long *)init_level4_pgt;
+
+ addr = page[pgd_index(va)];
+ addr_to_page(addr, page);
+ addr = page[pud_index(va)];
+ addr_to_page(addr, page);
+
+ /* Kill mapping of low 1MB. */
+ while (va < (unsigned long)&_text) {
+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+ BUG();
+ va += PAGE_SIZE;
+ }
+
+ /* Ensure init mappings cover kernel text/data and initial tables. */
+ while (va < (__START_KERNEL_map
+ + (start_pfn << PAGE_SHIFT)
+ + tables_space)) {
+ pmd = (pmd_t *)&page[pmd_index(va)];
+ if (pmd_none(*pmd)) {
+ pte_page = alloc_static_page(&phys);
+ early_make_page_readonly(
+ pte_page, XENFEAT_writable_page_tables);
+ set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
+ } else {
+ addr = page[pmd_index(va)];
+ addr_to_page(addr, pte_page);
+ }
+ pte = (pte_t *)&pte_page[pte_index(va)];
+ if (pte_none(*pte)) {
+ new_pte = pfn_pte(
+ (va - __START_KERNEL_map) >> PAGE_SHIFT,
+ __pgprot(_KERNPG_TABLE));
+ xen_l1_entry_update(pte, new_pte);
+ }
+ va += PAGE_SIZE;
+ }
+
+ /* Finally, blow away any spurious initial mappings. */
+ while (1) {
+ pmd = (pmd_t *)&page[pmd_index(va)];
+ if (pmd_none(*pmd))
+ break;
+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+ BUG();
+ va += PAGE_SIZE;
+ }
+
+ if (start_pfn > start)
+ reserve_early(start << PAGE_SHIFT,
+ start_pfn << PAGE_SHIFT, "INITMAP");
+}
+
+static void __init find_early_table_space(unsigned long end)
+{
+ unsigned long puds, pmds, ptes, tables;
+
+ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+ ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
+
+ tables = round_up(puds * 8, PAGE_SIZE) +
+ round_up(pmds * 8, PAGE_SIZE) +
+ round_up(ptes * 8, PAGE_SIZE);
+
+ extend_init_mapping(tables);
+
+ table_start = start_pfn;
+ table_end = table_start + (tables>>PAGE_SHIFT);
+
+ early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, table_start << PAGE_SHIFT,
+ (table_start << PAGE_SHIFT) + tables);
+}
+
+static void __init xen_finish_init_mapping(void)
+{
+ unsigned long i, start, end;
+
+ /* Re-vector virtual addresses pointing into the initial
+ mapping to the just-established permanent ones. */
+ xen_start_info = __va(__pa(xen_start_info));
+ xen_start_info->pt_base = (unsigned long)
+ __va(__pa(xen_start_info->pt_base));
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ phys_to_machine_mapping =
+ __va(__pa(xen_start_info->mfn_list));
+ xen_start_info->mfn_list = (unsigned long)
+ phys_to_machine_mapping;
+ }
+ if (xen_start_info->mod_start)
+ xen_start_info->mod_start = (unsigned long)
+ __va(__pa(xen_start_info->mod_start));
+
+ /* Destroy the Xen-created mappings beyond the kernel image as
+ * well as the temporary mappings created above. Prevents
+ * overlap with modules area (if init mapping is very big).
+ */
+ start = PAGE_ALIGN((unsigned long)_end);
+ end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
+ for (; start < end; start += PAGE_SIZE)
+ if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
+ BUG();
+
+ /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
+ table_end = ~0UL;
+
+ /*
+ * Prefetch pte's for the bt_ioremap() area. It gets used before the
+ * boot-time allocator is online, so allocate-on-demand would fail.
+ */
+ early_ioremap_clear();
+ for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
+ __set_fixmap(i, 0, __pgprot(0));
+ early_ioremap_reset();
+
+ /* Switch to the real shared_info page, and clear the dummy page. */
+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+ memset(empty_zero_page, 0, sizeof(empty_zero_page));
+
+ /* Set up mapping of lowest 1MB of physical memory. */
+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
+ if (is_initial_xendomain())
+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
+ else
+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
+ virt_to_mfn(empty_zero_page)
+ << PAGE_SHIFT,
+ PAGE_KERNEL_RO);
+
+ /* Disable the 'start_pfn' allocator. */
+ table_end = start_pfn;
+}
+
+static void __init init_gbpages(void)
+{
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+}
+
+#ifdef CONFIG_MEMTEST_BOOTPARAM
+
+static void __init memtest(unsigned long start_phys, unsigned long size,
+ unsigned pattern)
+{
+ unsigned long i;
+ unsigned long *start;
+ unsigned long start_bad;
+ unsigned long last_bad;
+ unsigned long val;
+ unsigned long start_phys_aligned;
+ unsigned long count;
+ unsigned long incr;
+
+ switch (pattern) {
+ case 0:
+ val = 0UL;
+ break;
+ case 1:
+ val = -1UL;
+ break;
+ case 2:
+ val = 0x5555555555555555UL;
+ break;
+ case 3:
+ val = 0xaaaaaaaaaaaaaaaaUL;
+ break;
+ default:
+ return;
+ }
+
+ incr = sizeof(unsigned long);
+ start_phys_aligned = ALIGN(start_phys, incr);
+ count = (size - (start_phys_aligned - start_phys))/incr;
+ start = __va(start_phys_aligned);
+ start_bad = 0;
+ last_bad = 0;
+
+ for (i = 0; i < count; i++)
+ start[i] = val;
+ for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
+ if (*start != val) {
+ if (start_phys_aligned == last_bad + incr) {
+ last_bad += incr;
+ } else {
+ if (start_bad) {
+ printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
+ val, start_bad, last_bad + incr);
+ reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+ }
+ start_bad = last_bad = start_phys_aligned;
+ }
+ }
+ }
+ if (start_bad) {
+ printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
+ val, start_bad, last_bad + incr);
+ reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+ }
+
+}
+
+static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
+
+static int __init parse_memtest(char *arg)
+{
+ if (arg)
+ memtest_pattern = simple_strtoul(arg, NULL, 0);
+ return 0;
+}
+
+early_param("memtest", parse_memtest);
+
+static void __init early_memtest(unsigned long start, unsigned long end)
+{
+ u64 t_start, t_size;
+ unsigned pattern;
+
+ if (!memtest_pattern)
+ return;
+
+ printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
+ for (pattern = 0; pattern < memtest_pattern; pattern++) {
+ t_start = start;
+ t_size = 0;
+ while (t_start < end) {
+ t_start = find_e820_area_size(t_start, &t_size, 1);
+
+ /* done ? */
+ if (t_start >= end)
+ break;
+ if (t_start + t_size > end)
+ t_size = end - t_start;
+
+ printk(KERN_CONT "\n %016llx - %016llx pattern %d",
+ (unsigned long long)t_start,
+ (unsigned long long)t_start + t_size, pattern);
+
+ memtest(t_start, t_size, pattern);
+
+ t_start += t_size;
+ }
+ }
+ printk(KERN_CONT "\n");
+}
+#else
+static void __init early_memtest(unsigned long start, unsigned long end)
+{
+}
+#endif
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
+{
+ unsigned long next, last_map_addr = end;
+ unsigned long start_phys = start, end_phys = end;
+
+ printk(KERN_INFO "init_memory_mapping\n");
+
+ /*
+ * Find space for the kernel direct mapping tables.
+ *
+ * Later we should allocate these tables in the local node of the
+ * memory mapped. Unfortunately this is done currently before the
+ * nodes are discovered.
+ */
+ if (!after_bootmem) {
+ init_gbpages();
+ find_early_table_space(end);
+ }
+
+ start = (unsigned long)__va(start);
+ end = (unsigned long)__va(end);
+
+ for (; start < end; start = next) {
+ pgd_t *pgd = pgd_offset_k(start);
+ unsigned long pud_phys;
+ pud_t *pud;
+
+ if (after_bootmem)
+ pud = pud_offset(pgd, start & PGDIR_MASK);
+ else
+ pud = alloc_static_page(&pud_phys);
+ next = start + PGDIR_SIZE;
+ if (next > end)
+ next = end;
+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
+ if (!after_bootmem) {
+ early_make_page_readonly(pud, XENFEAT_writable_page_tables);
+ set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
+ }
+ }
+
+ if (!after_bootmem) {
+ BUG_ON(start_pfn != table_end);
+ xen_finish_init_mapping();
+ }
+
+ __flush_tlb_all();
+
+ if (!after_bootmem)
+ reserve_early(table_start << PAGE_SHIFT,
+ table_end << PAGE_SHIFT, "PGTABLE");
+
+ if (!after_bootmem)
+ early_memtest(start_phys, end_phys);
+
+ return last_map_addr;
+}
+
+#ifndef CONFIG_NUMA
+void __init paging_init(void)
+{
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
+
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+ max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+ max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+ max_zone_pfns[ZONE_NORMAL] = end_pfn;
+
+ memory_present(0, 0, end_pfn);
+ sparse_init();
+ free_area_init_nodes(max_zone_pfns);
+
+ SetPagePinned(virt_to_page(init_mm.pgd));
+}
+#endif
+
+/*
+ * Memory hotplug specific functions
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Memory is added always to NORMAL zone. This means you will never get
+ * additional DMA/DMA32 memory.
+ */
+int arch_add_memory(int nid, u64 start, u64 size)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
+ unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ int ret;
+
+ last_mapped_pfn = init_memory_mapping(start, start + size-1);
+ if (last_mapped_pfn > max_pfn_mapped)
+ max_pfn_mapped = last_mapped_pfn;
+
+ ret = __add_pages(zone, start_pfn, nr_pages);
+ WARN_ON(1);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(arch_add_memory);
+
+#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
+int memory_add_physaddr_to_nid(u64 start)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
+
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (pagenr <= 256)
+ return 1;
+ if (mfn_to_local_pfn(pagenr) >= max_pfn)
+ return 1;
+ return 0;
+}
+
+
+static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
+ kcore_modules, kcore_vsyscall;
+
+void __init mem_init(void)
+{
+ long codesize, reservedpages, datasize, initsize;
+ unsigned long pfn;
+
+ contiguous_bitmap = alloc_bootmem_low_pages(
+ (end_pfn + 2*BITS_PER_LONG) >> 3);
+ BUG_ON(!contiguous_bitmap);
+ memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);
+
+ pci_iommu_alloc();
+
+ /* clear_bss() already clear the empty_zero_page */
+
+ reservedpages = 0;
+
+ /* this will put all low memory onto the freelists */
+#ifdef CONFIG_NUMA
+ totalram_pages = numa_free_all_bootmem();
+#else
+ totalram_pages = free_all_bootmem();
+#endif
+ /* XEN: init and count pages outside initial allocation. */
+ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
+ ClearPageReserved(pfn_to_page(pfn));
+ init_page_count(pfn_to_page(pfn));
+ totalram_pages++;
+ }
+ reservedpages = end_pfn - totalram_pages -
+ absent_pages_in_range(0, end_pfn);
+ after_bootmem = 1;
+
+ codesize = (unsigned long) &_etext - (unsigned long) &_text;
+ datasize = (unsigned long) &_edata - (unsigned long) &_etext;
+ initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
+
+ /* Register memory areas for /proc/kcore */
+ kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
+ VMALLOC_END-VMALLOC_START);
+ kclist_add(&kcore_kernel, &_stext, _end - _stext);
+ kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
+ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
+ VSYSCALL_END - VSYSCALL_START);
+
+ printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
+ "%ldk reserved, %ldk data, %ldk init)\n",
+ (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+ end_pfn << (PAGE_SHIFT-10),
+ codesize >> 10,
+ reservedpages << (PAGE_SHIFT-10),
+ datasize >> 10,
+ initsize >> 10);
+
+ cpa_init();
+}
+
+void free_init_pages(char *what, unsigned long begin, unsigned long end)
+{
+ unsigned long addr = begin;
+
+ if (addr >= end)
+ return;
+
+ /*
+ * If debugging page accesses then do not free this memory but
+ * mark them not present - any buggy init-section access will
+ * create a kernel page fault:
+ */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+ begin, PAGE_ALIGN(end));
+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
+ printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+
+ for (; addr < end; addr += PAGE_SIZE) {
+ ClearPageReserved(virt_to_page(addr));
+ init_page_count(virt_to_page(addr));
+ memset((void *)(addr & ~(PAGE_SIZE-1)),
+ POISON_FREE_INITMEM, PAGE_SIZE);
+ if (addr >= __START_KERNEL_map) {
+ /* make_readonly() reports all kernel addresses. */
+ if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
+ pfn_pte(__pa(addr) >> PAGE_SHIFT,
+ PAGE_KERNEL),
+ 0))
+ BUG();
+ if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
+ BUG();
+ }
+ free_page(addr);
+ totalram_pages++;
+ }
+#endif
+}
+
+void free_initmem(void)
+{
+ free_init_pages("unused kernel memory",
+ (unsigned long)(&__init_begin),
+ (unsigned long)(&__init_end));
+}
+
+#ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
+
+void mark_rodata_ro(void)
+{
+ unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
+
+ printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+ (end - start) >> 10);
+ set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+
+ /*
+ * The rodata section (but not the kernel text!) should also be
+ * not-executable.
+ */
+ start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+ set_memory_nx(start, (end - start) >> PAGE_SHIFT);
+
+ rodata_test();
+
+#ifdef CONFIG_CPA_DEBUG
+ printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
+ set_memory_rw(start, (end-start) >> PAGE_SHIFT);
+
+ printk(KERN_INFO "Testing CPA: again\n");
+ set_memory_ro(start, (end-start) >> PAGE_SHIFT);
+#endif
+}
+
+#endif
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+ free_init_pages("initrd memory", start, end);
+}
+#endif
+
+void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+{
+#ifdef CONFIG_NUMA
+ int nid, next_nid;
+#endif
+ unsigned long pfn = phys >> PAGE_SHIFT;
+
+ if (pfn >= end_pfn) {
+ /*
+ * This can happen with kdump kernels when accessing
+ * firmware tables:
+ */
+ if (pfn < max_pfn_mapped)
+ return;
+
+ printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
+ phys, len);
+ return;
+ }
+
+ /* Should check here against the e820 map to avoid double free */
+#ifdef CONFIG_NUMA
+ nid = phys_to_nid(phys);
+ next_nid = phys_to_nid(phys + len - 1);
+ if (nid == next_nid)
+ reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ else
+ reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+#else
+ reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+#endif
+
+#ifndef CONFIG_XEN
+ if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
+ static unsigned long dma_reserve __initdata;
+
+ dma_reserve += len / PAGE_SIZE;
+ set_dma_reserve(dma_reserve);
+ }
+#endif
+}
+
+int kern_addr_valid(unsigned long addr)
+{
+ unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (above != 0 && above != -1UL)
+ return 0;
+
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd))
+ return 0;
+
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud))
+ return 0;
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return 0;
+
+ if (pmd_large(*pmd))
+ return pfn_valid(pmd_pfn(*pmd));
+
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte))
+ return 0;
+
+ return pfn_valid(pte_pfn(*pte));
+}
+
+/*
+ * A pseudo VMA to allow ptrace access for the vsyscall page. This only
+ * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
+ * not need special handling anymore:
+ */
+static struct vm_area_struct gate_vma = {
+ .vm_start = VSYSCALL_START,
+ .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
+ .vm_page_prot = PAGE_READONLY_EXEC,
+ .vm_flags = VM_READ | VM_EXEC
+};
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+#ifdef CONFIG_IA32_EMULATION
+ if (test_tsk_thread_flag(tsk, TIF_IA32))
+ return NULL;
+#endif
+ return &gate_vma;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+ struct vm_area_struct *vma = get_gate_vma(task);
+
+ if (!vma)
+ return 0;
+
+ return (addr >= vma->vm_start) && (addr < vma->vm_end);
+}
+
+/*
+ * Use this when you have no reliable task/vma, typically from interrupt
+ * context. It is less reliable than using the task's vma and may give
+ * false positives:
+ */
+int in_gate_area_no_task(unsigned long addr)
+{
+ return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
+}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
+ return "[vdso]";
+ if (vma == &gate_vma)
+ return "[vsyscall]";
+ return NULL;
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
+ */
+static long __meminitdata addr_start, addr_end;
+static void __meminitdata *p_start, *p_end;
+static int __meminitdata node_start;
+
+int __meminit
+vmemmap_populate(struct page *start_page, unsigned long size, int node)
+{
+ unsigned long addr = (unsigned long)start_page;
+ unsigned long end = (unsigned long)(start_page + size);
+ unsigned long next;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ for (; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
+
+ pgd = vmemmap_pgd_populate(addr, node);
+ if (!pgd)
+ return -ENOMEM;
+
+ pud = vmemmap_pud_populate(pgd, addr, node);
+ if (!pud)
+ return -ENOMEM;
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ pte_t entry;
+ void *p;
+
+ p = vmemmap_alloc_block(PMD_SIZE, node);
+ if (!p)
+ return -ENOMEM;
+
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE);
+ set_pmd(pmd, __pmd(pte_val(entry)));
+
+ /* check to see if we have contiguous blocks */
+ if (p_end != p || node_start != node) {
+ if (p_start)
+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ addr_start = addr;
+ node_start = node;
+ p_start = p;
+ }
+ addr_end = addr + PMD_SIZE;
+ p_end = p + PMD_SIZE;
+ } else {
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
+ }
+ }
+ return 0;
+}
+
+void __meminit vmemmap_populate_print_last(void)
+{
+ if (p_start) {
+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ p_start = NULL;
+ p_end = NULL;
+ node_start = 0;
+ }
+}
+#endif
--- /dev/null
+/*
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/pfn.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+#include <asm/pat.h>
+
+#ifdef CONFIG_X86_64
+
+#ifndef CONFIG_XEN
+unsigned long __phys_addr(unsigned long x)
+{
+ if (x >= __START_KERNEL_map)
+ return x - __START_KERNEL_map + phys_base;
+ return x - PAGE_OFFSET;
+}
+EXPORT_SYMBOL(__phys_addr);
+#endif
+
+static inline int phys_addr_valid(unsigned long addr)
+{
+ return addr < (1UL << boot_cpu_data.x86_phys_bits);
+}
+
+#else
+
+static inline int phys_addr_valid(unsigned long addr)
+{
+ return 1;
+}
+
+#endif
+
+static int direct_remap_area_pte_fn(pte_t *pte,
+ struct page *pmd_page,
+ unsigned long address,
+ void *data)
+{
+ mmu_update_t **v = (mmu_update_t **)data;
+
+ BUG_ON(!pte_none(*pte));
+
+ (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
+ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
+ (*v)++;
+
+ return 0;
+}
+
+static int __direct_remap_pfn_range(struct mm_struct *mm,
+ unsigned long address,
+ unsigned long mfn,
+ unsigned long size,
+ pgprot_t prot,
+ domid_t domid)
+{
+ int rc;
+ unsigned long i, start_address;
+ mmu_update_t *u, *v, *w;
+
+ u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+ if (u == NULL)
+ return -ENOMEM;
+
+ start_address = address;
+
+ flush_cache_all();
+
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
+ /* Flush a full batch after filling in the PTE ptrs. */
+ rc = apply_to_page_range(mm, start_address,
+ address - start_address,
+ direct_remap_area_pte_fn, &w);
+ if (rc)
+ goto out;
+ rc = -EFAULT;
+ if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
+ goto out;
+ v = w = u;
+ start_address = address;
+ }
+
+ /*
+ * Fill in the machine address: PTE ptr is done later by
+ * apply_to_page_range().
+ */
+ v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
+
+ mfn++;
+ address += PAGE_SIZE;
+ v++;
+ }
+
+ if (v != u) {
+ /* Final batch. */
+ rc = apply_to_page_range(mm, start_address,
+ address - start_address,
+ direct_remap_area_pte_fn, &w);
+ if (rc)
+ goto out;
+ rc = -EFAULT;
+ if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
+ goto out;
+ }
+
+ rc = 0;
+
+ out:
+ flush_tlb_all();
+
+ free_page((unsigned long)u);
+
+ return rc;
+}
+
+int direct_remap_pfn_range(struct vm_area_struct *vma,
+ unsigned long address,
+ unsigned long mfn,
+ unsigned long size,
+ pgprot_t prot,
+ domid_t domid)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return remap_pfn_range(vma, address, mfn, size, prot);
+
+ if (domid == DOMID_SELF)
+ return -EINVAL;
+
+ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+
+ vma->vm_mm->context.has_foreign_mappings = 1;
+
+ return __direct_remap_pfn_range(
+ vma->vm_mm, address, mfn, size, prot, domid);
+}
+EXPORT_SYMBOL(direct_remap_pfn_range);
+
+int direct_kernel_remap_pfn_range(unsigned long address,
+ unsigned long mfn,
+ unsigned long size,
+ pgprot_t prot,
+ domid_t domid)
+{
+ return __direct_remap_pfn_range(
+ &init_mm, address, mfn, size, prot, domid);
+}
+EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
+
+static int lookup_pte_fn(
+ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+ uint64_t *ptep = (uint64_t *)data;
+ if (ptep)
+ *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
+ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
+ return 0;
+}
+
+int create_lookup_pte_addr(struct mm_struct *mm,
+ unsigned long address,
+ uint64_t *ptep)
+{
+ return apply_to_page_range(mm, address, PAGE_SIZE,
+ lookup_pte_fn, ptep);
+}
+
+EXPORT_SYMBOL(create_lookup_pte_addr);
+
+static int noop_fn(
+ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+ return 0;
+}
+
+int touch_pte_range(struct mm_struct *mm,
+ unsigned long address,
+ unsigned long size)
+{
+ return apply_to_page_range(mm, address, size, noop_fn, NULL);
+}
+
+EXPORT_SYMBOL(touch_pte_range);
+
+#ifdef CONFIG_X86_32
+int __init page_is_ram(unsigned long pagenr)
+{
+ resource_size_t addr, end;
+ int i;
+
+#ifndef CONFIG_XEN
+ /*
+ * A special case is the first 4Kb of memory;
+ * This is a BIOS owned area, not kernel ram, but generally
+ * not listed as such in the E820 table.
+ */
+ if (pagenr == 0)
+ return 0;
+
+ /*
+ * Second special case: Some BIOSen report the PC BIOS
+ * area (640->1Mb) as ram even though it is not.
+ */
+ if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
+ pagenr < (BIOS_END >> PAGE_SHIFT))
+ return 0;
+#endif
+
+ for (i = 0; i < e820.nr_map; i++) {
+ /*
+ * Not usable memory:
+ */
+ if (e820.map[i].type != E820_RAM)
+ continue;
+ addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
+ end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
+
+
+ if ((pagenr >= addr) && (pagenr < end))
+ return 1;
+ }
+ return 0;
+}
+#endif
+
+/*
+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
+ * conflicts.
+ */
+int ioremap_change_attr(unsigned long vaddr, unsigned long size,
+ unsigned long prot_val)
+{
+ unsigned long nrpages = size >> PAGE_SHIFT;
+ int err;
+
+ switch (prot_val) {
+ case _PAGE_CACHE_UC:
+ default:
+ err = _set_memory_uc(vaddr, nrpages);
+ break;
+ case _PAGE_CACHE_WC:
+ err = _set_memory_wc(vaddr, nrpages);
+ break;
+ case _PAGE_CACHE_WB:
+ err = _set_memory_wb(vaddr, nrpages);
+ break;
+ }
+
+ return err;
+}
+
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access high addresses
+ * directly.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+static void __iomem *__ioremap_caller(resource_size_t phys_addr,
+ unsigned long size, unsigned long prot_val, void *caller)
+{
+ unsigned long mfn, offset, vaddr;
+ resource_size_t last_addr;
+ struct vm_struct *area;
+ unsigned long new_prot_val;
+ pgprot_t prot;
+ int retval;
+ domid_t domid = DOMID_IO;
+
+ /* Don't allow wraparound or zero size */
+ last_addr = phys_addr + size - 1;
+ if (!size || last_addr < phys_addr)
+ return NULL;
+
+ if (!phys_addr_valid(phys_addr)) {
+ printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
+ (unsigned long long)phys_addr);
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ /*
+ * Don't remap the low PCI/ISA area, it's always mapped..
+ */
+ if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
+ return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
+
+ /*
+ * Don't allow anybody to remap normal RAM that we're using..
+ */
+ for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) {
+ unsigned long pfn = mfn_to_local_pfn(mfn);
+
+ if (pfn_valid(pfn)) {
+ if (!PageReserved(pfn_to_page(pfn)))
+ return NULL;
+ domid = DOMID_SELF;
+ }
+ }
+ WARN_ON_ONCE(domid == DOMID_SELF);
+
+ /*
+ * Mappings have to be page-aligned
+ */
+ offset = phys_addr & ~PAGE_MASK;
+ phys_addr &= PAGE_MASK;
+ size = PAGE_ALIGN(last_addr+1) - phys_addr;
+
+ retval = reserve_memtype(phys_addr, phys_addr + size,
+ prot_val, &new_prot_val);
+ if (retval) {
+ pr_debug("Warning: reserve_memtype returned %d\n", retval);
+ return NULL;
+ }
+
+ if (prot_val != new_prot_val) {
+ /*
+ * Do not fallback to certain memory types with certain
+ * requested type:
+ * - request is uc-, return cannot be write-back
+ * - request is uc-, return cannot be write-combine
+ * - request is write-combine, return cannot be write-back
+ */
+ if ((prot_val == _PAGE_CACHE_UC_MINUS &&
+ (new_prot_val == _PAGE_CACHE_WB ||
+ new_prot_val == _PAGE_CACHE_WC)) ||
+ (prot_val == _PAGE_CACHE_WC &&
+ new_prot_val == _PAGE_CACHE_WB)) {
+ pr_debug(
+ "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
+ (unsigned long long)phys_addr,
+ (unsigned long long)(phys_addr + size),
+ prot_val, new_prot_val);
+ free_memtype(phys_addr, phys_addr + size);
+ return NULL;
+ }
+ prot_val = new_prot_val;
+ }
+
+ switch (prot_val) {
+ case _PAGE_CACHE_UC:
+ default:
+ prot = PAGE_KERNEL_NOCACHE;
+ break;
+ case _PAGE_CACHE_UC_MINUS:
+ prot = PAGE_KERNEL_UC_MINUS;
+ break;
+ case _PAGE_CACHE_WC:
+ prot = PAGE_KERNEL_WC;
+ break;
+ case _PAGE_CACHE_WB:
+ prot = PAGE_KERNEL;
+ break;
+ }
+
+ /*
+ * Ok, go for it..
+ */
+ area = get_vm_area_caller(size, VM_IOREMAP, caller);
+ if (!area)
+ return NULL;
+ area->phys_addr = phys_addr;
+ vaddr = (unsigned long) area->addr;
+ if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
+ size, prot, domid)) {
+ free_memtype(phys_addr, phys_addr + size);
+ free_vm_area(area);
+ return NULL;
+ }
+
+ if (ioremap_change_attr(vaddr, size, prot_val) < 0) {
+ free_memtype(phys_addr, phys_addr + size);
+ vunmap(area->addr);
+ return NULL;
+ }
+
+ return (void __iomem *) (vaddr + offset);
+}
+
+/**
+ * ioremap_nocache - map bus memory into CPU space
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked uncachable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many
+ * busses. In particular driver authors should read up on PCI writes
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
+{
+ /*
+ * Ideally, this should be:
+ * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
+ *
+ * Till we fix all X drivers to use ioremap_wc(), we will use
+ * UC MINUS.
+ */
+ unsigned long val = _PAGE_CACHE_UC_MINUS;
+
+ return __ioremap_caller(phys_addr, size, val,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_nocache);
+
+/**
+ * ioremap_wc - map memory into CPU space write combined
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * This version of ioremap ensures that the memory is marked write combining.
+ * Write combining allows faster writes to some hardware devices.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
+{
+ if (pat_wc_enabled)
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
+ __builtin_return_address(0));
+ else
+ return ioremap_nocache(phys_addr, size);
+}
+EXPORT_SYMBOL(ioremap_wc);
+
+void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
+{
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_cache);
+
+#ifndef CONFIG_XEN
+static void __iomem *ioremap_default(resource_size_t phys_addr,
+ unsigned long size)
+{
+ unsigned long flags;
+ void *ret;
+ int err;
+
+ /*
+ * - WB for WB-able memory and no other conflicting mappings
+ * - UC_MINUS for non-WB-able memory with no other conflicting mappings
+ * - Inherit from confliting mappings otherwise
+ */
+ err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
+ if (err < 0)
+ return NULL;
+
+ ret = (void *) __ioremap_caller(phys_addr, size, flags,
+ __builtin_return_address(0));
+
+ free_memtype(phys_addr, phys_addr + size);
+ return (void __iomem *)ret;
+}
+#endif
+
+/**
+ * iounmap - Free a IO remapping
+ * @addr: virtual address from ioremap_*
+ *
+ * Caller must ensure there is only one unmapping for the same pointer.
+ */
+void iounmap(volatile void __iomem *addr)
+{
+ struct vm_struct *p, *o;
+
+ if ((void __force *)addr <= high_memory)
+ return;
+
+ /*
+ * __ioremap special-cases the PCI/ISA range by not instantiating a
+ * vm_area and by simply returning an address into the kernel mapping
+ * of ISA space. So handle that here.
+ */
+ if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
+ return;
+
+ addr = (volatile void __iomem *)
+ (PAGE_MASK & (unsigned long __force)addr);
+
+ /* Use the vm area unlocked, assuming the caller
+ ensures there isn't another iounmap for the same address
+ in parallel. Reuse of the virtual address is prevented by
+ leaving it in the global lists until we're done with it.
+ cpa takes care of the direct mappings. */
+ read_lock(&vmlist_lock);
+ for (p = vmlist; p; p = p->next) {
+ if (p->addr == addr)
+ break;
+ }
+ read_unlock(&vmlist_lock);
+
+ if (!p) {
+ printk(KERN_ERR "iounmap: bad address %p\n", addr);
+ dump_stack();
+ return;
+ }
+
+ free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
+
+ /* Finally remove it */
+ o = remove_vm_area((void *)addr);
+ BUG_ON(p != o || o == NULL);
+ kfree(p);
+}
+EXPORT_SYMBOL(iounmap);
+
+#ifndef CONFIG_XEN
+/*
+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
+ * access
+ */
+void *xlate_dev_mem_ptr(unsigned long phys)
+{
+ void *addr;
+ unsigned long start = phys & PAGE_MASK;
+
+ /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
+ if (page_is_ram(start >> PAGE_SHIFT))
+ return __va(phys);
+
+ addr = (void *)ioremap_default(start, PAGE_SIZE);
+ if (addr)
+ addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
+
+ return addr;
+}
+
+void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
+{
+ if (page_is_ram(phys >> PAGE_SHIFT))
+ return;
+
+ iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+ return;
+}
+#endif
+
+int __initdata early_ioremap_debug;
+
+static int __init early_ioremap_debug_setup(char *str)
+{
+ early_ioremap_debug = 1;
+
+ return 0;
+}
+early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+static __initdata int after_paging_init;
+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
+ __section(.bss.page_aligned);
+
+#ifdef CONFIG_X86_32
+static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
+{
+ /* Don't assume we're using swapper_pg_dir at this point */
+ pgd_t *base = __va(read_cr3());
+ pgd_t *pgd = &base[pgd_index(addr)];
+ pud_t *pud = pud_offset(pgd, addr);
+ pmd_t *pmd = pmd_offset(pud, addr);
+
+ return pmd;
+}
+#else
+#define early_ioremap_pmd early_get_pmd
+#undef make_lowmem_page_readonly
+#define make_lowmem_page_readonly early_make_page_readonly
+#endif
+
+static inline pte_t * __init early_ioremap_pte(unsigned long addr)
+{
+ return &bm_pte[pte_index(addr)];
+}
+
+void __init early_ioremap_init(void)
+{
+ pmd_t *pmd;
+
+ if (early_ioremap_debug)
+ printk(KERN_INFO "early_ioremap_init()\n");
+
+ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
+ memset(bm_pte, 0, sizeof(bm_pte));
+ make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables);
+ pmd_populate_kernel(&init_mm, pmd, bm_pte);
+
+ /*
+ * The boot-ioremap range spans multiple pmds, for which
+ * we are not prepared:
+ */
+ if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
+ WARN_ON(1);
+ printk(KERN_WARNING "pmd %p != %p\n",
+ pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
+ printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
+ fix_to_virt(FIX_BTMAP_BEGIN));
+ printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
+ fix_to_virt(FIX_BTMAP_END));
+
+ printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
+ printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
+ FIX_BTMAP_BEGIN);
+ }
+}
+
+void __init early_ioremap_clear(void)
+{
+ pmd_t *pmd;
+
+ if (early_ioremap_debug)
+ printk(KERN_INFO "early_ioremap_clear()\n");
+
+ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
+ pmd_clear(pmd);
+ make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
+ /* paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); */
+ __flush_tlb_all();
+}
+
+void __init early_ioremap_reset(void)
+{
+ enum fixed_addresses idx;
+ unsigned long addr, phys;
+ pte_t *pte;
+
+ after_paging_init = 1;
+ for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
+ addr = fix_to_virt(idx);
+ pte = early_ioremap_pte(addr);
+ if (pte_present(*pte)) {
+ phys = __pte_val(*pte) & PAGE_MASK;
+ set_fixmap(idx, phys);
+ }
+ }
+}
+
+static void __init __early_set_fixmap(enum fixed_addresses idx,
+ unsigned long phys, pgprot_t flags)
+{
+ unsigned long addr = __fix_to_virt(idx);
+ pte_t *pte;
+
+ if (idx >= __end_of_fixed_addresses) {
+ BUG();
+ return;
+ }
+ pte = early_ioremap_pte(addr);
+ if (pgprot_val(flags))
+ set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
+ else
+ pte_clear(NULL, addr, pte);
+ __flush_tlb_one(addr);
+}
+
+static inline void __init early_set_fixmap(enum fixed_addresses idx,
+ unsigned long phys)
+{
+ if (after_paging_init)
+ set_fixmap(idx, phys);
+ else
+ __early_set_fixmap(idx, phys, PAGE_KERNEL);
+}
+
+static inline void __init early_clear_fixmap(enum fixed_addresses idx)
+{
+ if (after_paging_init)
+ clear_fixmap(idx);
+ else
+ __early_set_fixmap(idx, 0, __pgprot(0));
+}
+
+
+int __initdata early_ioremap_nested;
+
+static int __init check_early_ioremap_leak(void)
+{
+ if (!early_ioremap_nested)
+ return 0;
+
+ printk(KERN_WARNING
+ "Debug warning: early ioremap leak of %d areas detected.\n",
+ early_ioremap_nested);
+ printk(KERN_WARNING
+ "please boot with early_ioremap_debug and report the dmesg.\n");
+ WARN_ON(1);
+
+ return 1;
+}
+late_initcall(check_early_ioremap_leak);
+
+void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
+{
+ unsigned long offset, last_addr;
+ unsigned int nrpages, nesting;
+ enum fixed_addresses idx0, idx;
+
+ WARN_ON(system_state != SYSTEM_BOOTING);
+
+ nesting = early_ioremap_nested;
+ if (early_ioremap_debug) {
+ printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
+ phys_addr, size, nesting);
+ dump_stack();
+ }
+
+ /* Don't allow wraparound or zero size */
+ last_addr = phys_addr + size - 1;
+ if (!size || last_addr < phys_addr) {
+ WARN_ON(1);
+ return NULL;
+ }
+
+ if (nesting >= FIX_BTMAPS_NESTING) {
+ WARN_ON(1);
+ return NULL;
+ }
+ early_ioremap_nested++;
+ /*
+ * Mappings have to be page-aligned
+ */
+ offset = phys_addr & ~PAGE_MASK;
+ phys_addr &= PAGE_MASK;
+ size = PAGE_ALIGN(last_addr) - phys_addr;
+
+ /*
+ * Mappings have to fit in the FIX_BTMAP area.
+ */
+ nrpages = size >> PAGE_SHIFT;
+ if (nrpages > NR_FIX_BTMAPS) {
+ WARN_ON(1);
+ return NULL;
+ }
+
+ /*
+ * Ok, go for it..
+ */
+ idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+ idx = idx0;
+ while (nrpages > 0) {
+ early_set_fixmap(idx, phys_addr);
+ phys_addr += PAGE_SIZE;
+ --idx;
+ --nrpages;
+ }
+ if (early_ioremap_debug)
+ printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
+
+ return (void *) (offset + fix_to_virt(idx0));
+}
+
+void __init early_iounmap(void *addr, unsigned long size)
+{
+ unsigned long virt_addr;
+ unsigned long offset;
+ unsigned int nrpages;
+ enum fixed_addresses idx;
+ int nesting;
+
+ nesting = --early_ioremap_nested;
+ if (WARN_ON(nesting < 0))
+ return;
+
+ if (early_ioremap_debug) {
+ printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
+ size, nesting);
+ dump_stack();
+ }
+
+ virt_addr = (unsigned long)addr;
+ if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
+ WARN_ON(1);
+ return;
+ }
+ offset = virt_addr & ~PAGE_MASK;
+ nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
+
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+ while (nrpages > 0) {
+ early_clear_fixmap(idx);
+ --idx;
+ --nrpages;
+ }
+}
+
+void __this_fixmap_does_not_exist(void)
+{
+ WARN_ON(1);
+}
--- /dev/null
+/*
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Thanks to Ben LaHaise for precious feedback.
+ */
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+
+#include <asm/e820.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <asm/proto.h>
+#include <asm/pat.h>
+#include <asm/mmu_context.h>
+
+#ifndef CONFIG_X86_64
+#define TASK_SIZE64 TASK_SIZE
+#endif
+
+static void _pin_lock(struct mm_struct *mm, int lock) {
+ if (lock)
+ spin_lock(&mm->page_table_lock);
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+ /* While mm->page_table_lock protects us against insertions and
+ * removals of higher level page table pages, it doesn't protect
+ * against updates of pte-s. Such updates, however, require the
+ * pte pages to be in consistent state (unpinned+writable or
+ * pinned+readonly). The pinning and attribute changes, however
+ * cannot be done atomically, which is why such updates must be
+ * prevented from happening concurrently.
+ * Note that no pte lock can ever elsewhere be acquired nesting
+ * with an already acquired one in the same mm, or with the mm's
+ * page_table_lock already acquired, as that would break in the
+ * non-split case (where all these are actually resolving to the
+ * one page_table_lock). Thus acquiring all of them here is not
+ * going to result in dead locks, and the order of acquires
+ * doesn't matter.
+ */
+ {
+ pgd_t *pgd = mm->pgd;
+ unsigned g;
+
+ for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+ pud_t *pud;
+ unsigned u;
+
+ if (pgd_none(*pgd))
+ continue;
+ pud = pud_offset(pgd, 0);
+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+ pmd_t *pmd;
+ unsigned m;
+
+ if (pud_none(*pud))
+ continue;
+ pmd = pmd_offset(pud, 0);
+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+ spinlock_t *ptl;
+
+ if (pmd_none(*pmd))
+ continue;
+ ptl = pte_lockptr(0, pmd);
+ if (lock)
+ spin_lock(ptl);
+ else
+ spin_unlock(ptl);
+ }
+ }
+ }
+ }
+#endif
+ if (!lock)
+ spin_unlock(&mm->page_table_lock);
+}
+#define pin_lock(mm) _pin_lock(mm, 1)
+#define pin_unlock(mm) _pin_lock(mm, 0)
+
+#define PIN_BATCH sizeof(void *)
+static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
+
+static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
+ unsigned int cpu, unsigned int seq)
+{
+ unsigned long pfn = page_to_pfn(page);
+
+ if (PageHighMem(page)) {
+ if (pgprot_val(flags) & _PAGE_RW)
+ ClearPagePinned(page);
+ else
+ SetPagePinned(page);
+ } else {
+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ pfn_pte(pfn, flags), 0);
+ if (unlikely(++seq == PIN_BATCH)) {
+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
+ PIN_BATCH, NULL)))
+ BUG();
+ seq = 0;
+ }
+ }
+
+ return seq;
+}
+
+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
+{
+ pgd_t *pgd = pgd_base;
+ pud_t *pud;
+ pmd_t *pmd;
+ int g,u,m;
+ unsigned int cpu, seq;
+ multicall_entry_t *mcl;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ cpu = get_cpu();
+
+ /*
+ * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
+ * may not be the 'current' task's pagetables (e.g., current may be
+ * 32-bit, but the pagetables may be for a 64-bit task).
+ * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
+ * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
+ */
+ for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+ if (pgd_none(*pgd))
+ continue;
+ pud = pud_offset(pgd, 0);
+ if (PTRS_PER_PUD > 1) /* not folded */
+ seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+ if (pud_none(*pud))
+ continue;
+ pmd = pmd_offset(pud, 0);
+ if (PTRS_PER_PMD > 1) /* not folded */
+ seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+ if (pmd_none(*pmd))
+ continue;
+ seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
+ }
+ }
+ }
+
+ mcl = per_cpu(pb_mcl, cpu);
+#ifdef CONFIG_X86_64
+ if (unlikely(seq > PIN_BATCH - 2)) {
+ if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
+ BUG();
+ seq = 0;
+ }
+ MULTI_update_va_mapping(mcl + seq,
+ (unsigned long)__user_pgd(pgd_base),
+ pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
+ 0);
+ MULTI_update_va_mapping(mcl + seq + 1,
+ (unsigned long)pgd_base,
+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+ UVMF_TLB_FLUSH);
+ if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
+ BUG();
+#else
+ if (likely(seq != 0)) {
+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
+ (unsigned long)pgd_base,
+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+ UVMF_TLB_FLUSH);
+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
+ seq + 1, NULL)))
+ BUG();
+ } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+ UVMF_TLB_FLUSH))
+ BUG();
+#endif
+
+ put_cpu();
+}
+
+static void __pgd_pin(pgd_t *pgd)
+{
+ pgd_walk(pgd, PAGE_KERNEL_RO);
+ kmap_flush_unused();
+ xen_pgd_pin(__pa(pgd)); /* kernel */
+#ifdef CONFIG_X86_64
+ xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
+#endif
+ SetPagePinned(virt_to_page(pgd));
+}
+
+static void __pgd_unpin(pgd_t *pgd)
+{
+ xen_pgd_unpin(__pa(pgd));
+#ifdef CONFIG_X86_64
+ xen_pgd_unpin(__pa(__user_pgd(pgd)));
+#endif
+ pgd_walk(pgd, PAGE_KERNEL);
+ ClearPagePinned(virt_to_page(pgd));
+}
+
+void pgd_test_and_unpin(pgd_t *pgd)
+{
+ if (PagePinned(virt_to_page(pgd)))
+ __pgd_unpin(pgd);
+}
+
+void mm_pin(struct mm_struct *mm)
+{
+ if (xen_feature(XENFEAT_writable_page_tables))
+ return;
+
+ pin_lock(mm);
+ __pgd_pin(mm->pgd);
+ pin_unlock(mm);
+}
+
+void mm_unpin(struct mm_struct *mm)
+{
+ if (xen_feature(XENFEAT_writable_page_tables))
+ return;
+
+ pin_lock(mm);
+ __pgd_unpin(mm->pgd);
+ pin_unlock(mm);
+}
+
+void mm_pin_all(void)
+{
+ struct page *page;
+ unsigned long flags;
+
+ if (xen_feature(XENFEAT_writable_page_tables))
+ return;
+
+ /*
+ * Allow uninterrupted access to the pgd_list. Also protects
+ * __pgd_pin() by disabling preemption.
+ * All other CPUs must be at a safe point (e.g., in stop_machine
+ * or offlined entirely).
+ */
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ if (!PagePinned(page))
+ __pgd_pin((pgd_t *)page_address(page));
+ }
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ if (!PagePinned(virt_to_page(mm->pgd)))
+ mm_pin(mm);
+}
+
+void arch_exit_mmap(struct mm_struct *mm)
+{
+ struct task_struct *tsk = current;
+
+ task_lock(tsk);
+
+ /*
+ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+ * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+ */
+ if (tsk->active_mm == mm) {
+ tsk->active_mm = &init_mm;
+ atomic_inc(&init_mm.mm_count);
+
+ switch_mm(mm, &init_mm, tsk);
+
+ atomic_dec(&mm->mm_count);
+ BUG_ON(atomic_read(&mm->mm_count) == 0);
+ }
+
+ task_unlock(tsk);
+
+ if (PagePinned(virt_to_page(mm->pgd))
+ && atomic_read(&mm->mm_count) == 1
+ && !mm->context.has_foreign_mappings)
+ mm_unpin(mm);
+}
+
+/* blktap and gntdev need this, as otherwise they would implicitly (and
+ * needlessly, as they never use it) reference init_mm. */
+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, int full)
+{
+ return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
+}
+EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
+
+/*
+ * The current flushing context - we pass it instead of 5 arguments:
+ */
+struct cpa_data {
+ unsigned long vaddr;
+ pgprot_t mask_set;
+ pgprot_t mask_clr;
+ int numpages;
+ int flushtlb;
+ unsigned long pfn;
+ unsigned force_split : 1;
+};
+
+#ifdef CONFIG_X86_64
+
+static inline unsigned long highmap_start_pfn(void)
+{
+ return __pa(_text) >> PAGE_SHIFT;
+}
+
+static inline unsigned long highmap_end_pfn(void)
+{
+ return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+# define debug_pagealloc 1
+#else
+# define debug_pagealloc 0
+#endif
+
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+ return addr >= start && addr < end;
+}
+
+/*
+ * Flushing functions
+ */
+
+/**
+ * clflush_cache_range - flush a cache range with clflush
+ * @addr: virtual start address
+ * @size: number of bytes to flush
+ *
+ * clflush is an unordered instruction which needs fencing with mfence
+ * to avoid ordering issues.
+ */
+void clflush_cache_range(void *vaddr, unsigned int size)
+{
+ void *vend = vaddr + size - 1;
+
+ mb();
+
+ for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
+ clflush(vaddr);
+ /*
+ * Flush any possible final partial cacheline:
+ */
+ clflush(vend);
+
+ mb();
+}
+
+static void __cpa_flush_all(void *arg)
+{
+ unsigned long cache = (unsigned long)arg;
+
+ /*
+ * Flush all to work around Errata in early athlons regarding
+ * large page flushing.
+ */
+ __flush_tlb_all();
+
+ if (cache && boot_cpu_data.x86_model >= 4)
+ wbinvd();
+}
+
+static void cpa_flush_all(unsigned long cache)
+{
+ BUG_ON(irqs_disabled());
+
+ on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
+}
+
+static void __cpa_flush_range(void *arg)
+{
+ /*
+ * We could optimize that further and do individual per page
+ * tlb invalidates for a low number of pages. Caveat: we must
+ * flush the high aliases on 64bit as well.
+ */
+ __flush_tlb_all();
+}
+
+static void cpa_flush_range(unsigned long start, int numpages, int cache)
+{
+ unsigned int i, level;
+ unsigned long addr;
+
+ BUG_ON(irqs_disabled());
+ WARN_ON(PAGE_ALIGN(start) != start);
+
+ on_each_cpu(__cpa_flush_range, NULL, 1, 1);
+
+ if (!cache)
+ return;
+
+ /*
+ * We only need to flush on one CPU,
+ * clflush is a MESI-coherent instruction that
+ * will cause all other CPUs to flush the same
+ * cachelines:
+ */
+ for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
+ pte_t *pte = lookup_address(addr, &level);
+
+ /*
+ * Only flush present addresses:
+ */
+ if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
+ clflush_cache_range((void *) addr, PAGE_SIZE);
+ }
+}
+
+/*
+ * Certain areas of memory on x86 require very specific protection flags,
+ * for example the BIOS area or kernel text. Callers don't always get this
+ * right (again, ioremap() on BIOS memory is not uncommon) so this function
+ * checks and fixes these known static required protection bits.
+ */
+static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
+ unsigned long pfn)
+{
+ pgprot_t forbidden = __pgprot(0);
+
+#ifndef CONFIG_XEN
+ /*
+ * The BIOS area between 640k and 1Mb needs to be executable for
+ * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
+ */
+ if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+ pgprot_val(forbidden) |= _PAGE_NX;
+#endif
+
+ /*
+ * The kernel text needs to be executable for obvious reasons
+ * Does not cover __inittext since that is gone later on. On
+ * 64bit we do not enforce !NX on the low mapping
+ */
+ if (within(address, (unsigned long)_text, (unsigned long)_etext))
+ pgprot_val(forbidden) |= _PAGE_NX;
+
+ /*
+ * The .rodata section needs to be read-only. Using the pfn
+ * catches all aliases.
+ */
+ if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
+ __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
+ pgprot_val(forbidden) |= _PAGE_RW;
+
+ prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+
+ return prot;
+}
+
+/*
+ * Lookup the page table entry for a virtual address. Return a pointer
+ * to the entry and the level of the mapping.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexisting mapping.
+ */
+pte_t *lookup_address(unsigned long address, unsigned int *level)
+{
+ pgd_t *pgd = pgd_offset_k(address);
+ pud_t *pud;
+ pmd_t *pmd;
+
+ *level = PG_LEVEL_NONE;
+
+ if (pgd_none(*pgd))
+ return NULL;
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud))
+ return NULL;
+
+ *level = PG_LEVEL_1G;
+ if (pud_large(*pud) || !pud_present(*pud))
+ return (pte_t *)pud;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ return NULL;
+
+ *level = PG_LEVEL_2M;
+ if (pmd_large(*pmd) || !pmd_present(*pmd))
+ return (pte_t *)pmd;
+
+ *level = PG_LEVEL_4K;
+
+ return pte_offset_kernel(pmd, address);
+}
+
+/*
+ * Set the new pmd in all the pgds we know about:
+ */
+static void __set_pmd_pte(pte_t *kpte, unsigned long address,
+ unsigned int level, pte_t pte)
+{
+ /* change init_mm */
+ switch(level) {
+ case PG_LEVEL_2M:
+ xen_l2_entry_update((pmd_t *)kpte, __pmd_ma(__pte_val(pte)));
+ break;
+#ifdef CONFIG_X86_64
+ case PG_LEVEL_1G:
+ xen_l3_entry_update((pud_t *)kpte, __pud_ma(__pte_val(pte)));
+ break;
+#endif
+ default:
+ BUG();
+ }
+#ifdef CONFIG_X86_32
+ if (!SHARED_KERNEL_PMD) {
+ struct page *page;
+
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ pud = pud_offset(pgd, address);
+ pmd = pmd_offset(pud, address);
+ xen_l2_entry_update(pmd, __pmd_ma(__pte_val(pte)));
+ }
+ }
+#endif
+}
+
+static int
+try_preserve_large_page(pte_t *kpte, unsigned long address,
+ struct cpa_data *cpa)
+{
+ unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
+ pte_t new_pte, old_pte, *tmp;
+ pgprot_t old_prot, new_prot;
+ int i, do_split = 1;
+ unsigned int level;
+
+ if (cpa->force_split)
+ return 1;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ /*
+ * Check for races, another CPU might have split this page
+ * up already:
+ */
+ tmp = lookup_address(address, &level);
+ if (tmp != kpte)
+ goto out_unlock;
+
+ switch (level) {
+ case PG_LEVEL_2M:
+ psize = PMD_PAGE_SIZE;
+ pmask = PMD_PAGE_MASK;
+ break;
+#ifdef CONFIG_X86_64
+ case PG_LEVEL_1G:
+ psize = PUD_PAGE_SIZE;
+ pmask = PUD_PAGE_MASK;
+ break;
+#endif
+ default:
+ do_split = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * Calculate the number of pages, which fit into this large
+ * page starting at address:
+ */
+ nextpage_addr = (address + psize) & pmask;
+ numpages = (nextpage_addr - address) >> PAGE_SHIFT;
+ if (numpages < cpa->numpages)
+ cpa->numpages = numpages;
+
+ /*
+ * We are safe now. Check whether the new pgprot is the same:
+ */
+ old_pte = *kpte;
+ old_prot = new_prot = pte_pgprot(old_pte);
+
+ pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+
+ /*
+ * old_pte points to the large page base address. So we need
+ * to add the offset of the virtual address:
+ */
+ pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
+ cpa->pfn = pfn;
+
+ new_prot = static_protections(new_prot, address, pfn);
+
+ /*
+ * We need to check the full range, whether
+ * static_protection() requires a different pgprot for one of
+ * the pages in the range we try to preserve:
+ */
+ if (pfn < max_mapnr) {
+ addr = address + PAGE_SIZE;
+ for (i = 1; i < cpa->numpages && ++pfn < max_mapnr;
+ i++, addr += PAGE_SIZE) {
+ pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
+
+ if (pgprot_val(chk_prot) != pgprot_val(new_prot))
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * If there are no changes, return. maxpages has been updated
+ * above:
+ */
+ if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
+ do_split = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * We need to change the attributes. Check, whether we can
+ * change the large page in one go. We request a split, when
+ * the address is not aligned and the number of pages is
+ * smaller than the number of pages in the large page. Note
+ * that we limited the number of possible pages already to
+ * the number of pages in the large page.
+ */
+ if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
+ /*
+ * The address is aligned and the number of pages
+ * covers the full page.
+ */
+ new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot));
+ __set_pmd_pte(kpte, address, level, new_pte);
+ cpa->flushtlb = 1;
+ do_split = 0;
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(&pgd_lock, flags);
+
+ return do_split;
+}
+
+static LIST_HEAD(page_pool);
+static unsigned long pool_size, pool_pages, pool_low;
+static unsigned long pool_used, pool_failed;
+
+static void cpa_fill_pool(struct page **ret)
+{
+ gfp_t gfp = GFP_KERNEL;
+ unsigned long flags;
+ struct page *p;
+
+ /*
+ * Avoid recursion (on debug-pagealloc) and also signal
+ * our priority to get to these pagetables:
+ */
+ if (current->flags & PF_MEMALLOC)
+ return;
+ current->flags |= PF_MEMALLOC;
+
+ /*
+ * Allocate atomically from atomic contexts:
+ */
+ if (in_atomic() || irqs_disabled() || debug_pagealloc)
+ gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
+
+ while (pool_pages < pool_size || (ret && !*ret)) {
+ p = alloc_pages(gfp, 0);
+ if (!p) {
+ pool_failed++;
+ break;
+ }
+ /*
+ * If the call site needs a page right now, provide it:
+ */
+ if (ret && !*ret) {
+ *ret = p;
+ continue;
+ }
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_add(&p->lru, &page_pool);
+ pool_pages++;
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ }
+
+ current->flags &= ~PF_MEMALLOC;
+}
+
+#define SHIFT_MB (20 - PAGE_SHIFT)
+#define ROUND_MB_GB ((1 << 10) - 1)
+#define SHIFT_MB_GB 10
+#define POOL_PAGES_PER_GB 16
+
+void __init cpa_init(void)
+{
+ struct sysinfo si;
+ unsigned long gb;
+
+ si_meminfo(&si);
+ /*
+ * Calculate the number of pool pages:
+ *
+ * Convert totalram (nr of pages) to MiB and round to the next
+ * GiB. Shift MiB to Gib and multiply the result by
+ * POOL_PAGES_PER_GB:
+ */
+ if (debug_pagealloc) {
+ gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
+ pool_size = POOL_PAGES_PER_GB * gb;
+ } else {
+ pool_size = 1;
+ }
+ pool_low = pool_size;
+
+ cpa_fill_pool(NULL);
+ printk(KERN_DEBUG
+ "CPA: page pool initialized %lu of %lu pages preallocated\n",
+ pool_pages, pool_size);
+}
+
+static int split_large_page(pte_t *kpte, unsigned long address)
+{
+ unsigned long flags, mfn, mfninc = 1;
+ unsigned int i, level;
+ pte_t *pbase, *tmp;
+ pgprot_t ref_prot;
+ struct page *base;
+
+ /*
+ * Get a page from the pool. The pool list is protected by the
+ * pgd_lock, which we have to take anyway for the split
+ * operation:
+ */
+ spin_lock_irqsave(&pgd_lock, flags);
+ if (list_empty(&page_pool)) {
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ base = NULL;
+ cpa_fill_pool(&base);
+ if (!base)
+ return -ENOMEM;
+ spin_lock_irqsave(&pgd_lock, flags);
+ } else {
+ base = list_first_entry(&page_pool, struct page, lru);
+ list_del(&base->lru);
+ pool_pages--;
+
+ if (pool_pages < pool_low)
+ pool_low = pool_pages;
+ }
+
+ /*
+ * Check for races, another CPU might have split this page
+ * up for us already:
+ */
+ tmp = lookup_address(address, &level);
+ if (tmp != kpte)
+ goto out_unlock;
+
+ pbase = (pte_t *)page_address(base);
+ paravirt_alloc_pte(&init_mm, page_to_pfn(base));
+ ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+
+#ifdef CONFIG_X86_64
+ if (level == PG_LEVEL_1G) {
+ mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
+ pgprot_val(ref_prot) |= _PAGE_PSE;
+ }
+#endif
+
+ /*
+ * Get the target mfn from the original entry:
+ */
+ mfn = __pte_mfn(*kpte);
+ for (i = 0; i < PTRS_PER_PTE; i++, mfn += mfninc)
+ set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot));
+
+ /*
+ * Install the new, split up pagetable. Important details here:
+ *
+ * On Intel the NX bit of all levels must be cleared to make a
+ * page executable. See section 4.13.2 of Intel 64 and IA-32
+ * Architectures Software Developer's Manual).
+ *
+ * Mark the entry present. The current mapping might be
+ * set to not present, which we preserved above.
+ */
+ if (HYPERVISOR_update_va_mapping((unsigned long)pbase,
+ mk_pte(base, PAGE_KERNEL_RO), 0))
+ BUG();
+ ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
+ pgprot_val(ref_prot) |= _PAGE_PRESENT;
+ __set_pmd_pte(kpte, address, level, mk_pte(base, ref_prot));
+ base = NULL;
+
+out_unlock:
+ /*
+ * If we dropped out via the lookup_address check under
+ * pgd_lock then stick the page back into the pool:
+ */
+ if (base) {
+ list_add(&base->lru, &page_pool);
+ pool_pages++;
+ } else
+ pool_used++;
+ spin_unlock_irqrestore(&pgd_lock, flags);
+
+ return 0;
+}
+
+static int __change_page_attr(struct cpa_data *cpa, int primary)
+{
+ unsigned long address = cpa->vaddr;
+ int do_split, err;
+ unsigned int level;
+ pte_t *kpte, old_pte;
+
+repeat:
+ kpte = lookup_address(address, &level);
+ if (!kpte)
+ return 0;
+
+ old_pte = *kpte;
+ if (!__pte_val(old_pte)) {
+ if (!primary)
+ return 0;
+ printk(KERN_WARNING "CPA: called for zero pte. "
+ "vaddr = %lx cpa->vaddr = %lx\n", address,
+ cpa->vaddr);
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if (level == PG_LEVEL_4K) {
+ pte_t new_pte;
+ pgprot_t new_prot = pte_pgprot(old_pte);
+ unsigned long mfn = __pte_mfn(old_pte);
+
+ pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+
+ new_prot = static_protections(new_prot, address,
+ mfn_to_local_pfn(mfn));
+
+ /*
+ * We need to keep the mfn from the existing PTE,
+ * after all we're only going to change it's attributes
+ * not the memory it points to
+ */
+ new_pte = pfn_pte_ma(mfn, canon_pgprot(new_prot));
+ cpa->pfn = mfn_to_local_pfn(mfn);
+ /*
+ * Do we really change anything ?
+ */
+ if (__pte_val(old_pte) != __pte_val(new_pte)) {
+ set_pte_atomic(kpte, new_pte);
+ cpa->flushtlb = 1;
+ }
+ cpa->numpages = 1;
+ return 0;
+ }
+
+ /*
+ * Check, whether we can keep the large page intact
+ * and just change the pte:
+ */
+ do_split = try_preserve_large_page(kpte, address, cpa);
+ /*
+ * When the range fits into the existing large page,
+ * return. cp->numpages and cpa->tlbflush have been updated in
+ * try_large_page:
+ */
+ if (do_split <= 0)
+ return do_split;
+
+ /*
+ * We have to split the large page:
+ */
+ err = split_large_page(kpte, address);
+ if (!err) {
+ cpa->flushtlb = 1;
+ goto repeat;
+ }
+
+ return err;
+}
+
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
+
+static int cpa_process_alias(struct cpa_data *cpa)
+{
+ struct cpa_data alias_cpa;
+ int ret = 0;
+
+ if (cpa->pfn > max_pfn_mapped)
+ return 0;
+
+ /*
+ * No need to redo, when the primary call touched the direct
+ * mapping already:
+ */
+ if (!within(cpa->vaddr, PAGE_OFFSET,
+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+
+ alias_cpa = *cpa;
+ alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
+
+ ret = __change_page_attr_set_clr(&alias_cpa, 0);
+ }
+
+#ifdef CONFIG_X86_64
+ if (ret)
+ return ret;
+ /*
+ * No need to redo, when the primary call touched the high
+ * mapping already:
+ */
+ if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
+ return 0;
+
+ /*
+ * If the physical address is inside the kernel map, we need
+ * to touch the high mapped kernel as well:
+ */
+ if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
+ return 0;
+
+ alias_cpa = *cpa;
+ alias_cpa.vaddr =
+ (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map;
+
+ /*
+ * The high mapping range is imprecise, so ignore the return value.
+ */
+ __change_page_attr_set_clr(&alias_cpa, 0);
+#endif
+ return ret;
+}
+
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
+{
+ int ret, numpages = cpa->numpages;
+
+ while (numpages) {
+ /*
+ * Store the remaining nr of pages for the large page
+ * preservation check.
+ */
+ cpa->numpages = numpages;
+
+ ret = __change_page_attr(cpa, checkalias);
+ if (ret)
+ return ret;
+
+ if (checkalias) {
+ ret = cpa_process_alias(cpa);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Adjust the number of pages with the result of the
+ * CPA operation. Either a large page has been
+ * preserved or a single page update happened.
+ */
+ BUG_ON(cpa->numpages > numpages);
+ numpages -= cpa->numpages;
+ cpa->vaddr += cpa->numpages * PAGE_SIZE;
+ }
+ return 0;
+}
+
+static inline int cache_attr(pgprot_t attr)
+{
+ return pgprot_val(attr) &
+ (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
+}
+
+static int change_page_attr_set_clr(unsigned long addr, int numpages,
+ pgprot_t mask_set, pgprot_t mask_clr,
+ int force_split)
+{
+ struct cpa_data cpa;
+ int ret, cache, checkalias;
+
+ /*
+ * Check, if we are requested to change a not supported
+ * feature:
+ */
+ mask_set = canon_pgprot(mask_set);
+ mask_clr = canon_pgprot(mask_clr);
+ if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
+ return 0;
+
+ /* Ensure we are PAGE_SIZE aligned */
+ if (addr & ~PAGE_MASK) {
+ addr &= PAGE_MASK;
+ /*
+ * People should not be passing in unaligned addresses:
+ */
+ WARN_ON_ONCE(1);
+ }
+
+ cpa.vaddr = addr;
+ cpa.numpages = numpages;
+ cpa.mask_set = mask_set;
+ cpa.mask_clr = mask_clr;
+ cpa.flushtlb = 0;
+ cpa.force_split = force_split;
+
+ /* No alias checking for _NX bit modifications */
+ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
+
+ ret = __change_page_attr_set_clr(&cpa, checkalias);
+
+ /*
+ * Check whether we really changed something:
+ */
+ if (!cpa.flushtlb)
+ goto out;
+
+ /*
+ * No need to flush, when we did not set any of the caching
+ * attributes:
+ */
+ cache = cache_attr(mask_set);
+
+ /*
+ * On success we use clflush, when the CPU supports it to
+ * avoid the wbindv. If the CPU does not support it and in the
+ * error case we fall back to cpa_flush_all (which uses
+ * wbindv):
+ */
+ if (!ret && cpu_has_clflush)
+ cpa_flush_range(addr, numpages, cache);
+ else
+ cpa_flush_all(cache);
+
+out:
+ cpa_fill_pool(NULL);
+
+ return ret;
+}
+
+static inline int change_page_attr_set(unsigned long addr, int numpages,
+ pgprot_t mask)
+{
+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
+}
+
+static inline int change_page_attr_clear(unsigned long addr, int numpages,
+ pgprot_t mask)
+{
+ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
+}
+
+int _set_memory_uc(unsigned long addr, int numpages)
+{
+ /*
+ * for now UC MINUS. see comments in ioremap_nocache()
+ */
+ return change_page_attr_set(addr, numpages,
+ __pgprot(_PAGE_CACHE_UC_MINUS));
+}
+
+int set_memory_uc(unsigned long addr, int numpages)
+{
+ /*
+ * for now UC MINUS. see comments in ioremap_nocache()
+ */
+ if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
+ _PAGE_CACHE_UC_MINUS, NULL))
+ return -EINVAL;
+
+ return _set_memory_uc(addr, numpages);
+}
+EXPORT_SYMBOL(set_memory_uc);
+
+int _set_memory_wc(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(addr, numpages,
+ __pgprot(_PAGE_CACHE_WC));
+}
+
+int set_memory_wc(unsigned long addr, int numpages)
+{
+ if (!pat_wc_enabled)
+ return set_memory_uc(addr, numpages);
+
+ if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
+ _PAGE_CACHE_WC, NULL))
+ return -EINVAL;
+
+ return _set_memory_wc(addr, numpages);
+}
+EXPORT_SYMBOL(set_memory_wc);
+
+int _set_memory_wb(unsigned long addr, int numpages)
+{
+ return change_page_attr_clear(addr, numpages,
+ __pgprot(_PAGE_CACHE_MASK));
+}
+
+int set_memory_wb(unsigned long addr, int numpages)
+{
+ free_memtype(addr, addr + numpages * PAGE_SIZE);
+
+ return _set_memory_wb(addr, numpages);
+}
+EXPORT_SYMBOL(set_memory_wb);
+
+int set_memory_x(unsigned long addr, int numpages)
+{
+ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
+}
+EXPORT_SYMBOL(set_memory_x);
+
+int set_memory_nx(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
+}
+EXPORT_SYMBOL(set_memory_nx);
+
+int set_memory_ro(unsigned long addr, int numpages)
+{
+ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
+}
+
+int set_memory_rw(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
+}
+
+int set_memory_np(unsigned long addr, int numpages)
+{
+ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
+}
+
+int set_memory_4k(unsigned long addr, int numpages)
+{
+ return change_page_attr_set_clr(addr, numpages, __pgprot(0),
+ __pgprot(0), 1);
+}
+
+int set_pages_uc(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_uc(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_uc);
+
+int set_pages_wb(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_wb(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_wb);
+
+int set_pages_x(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_x(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_x);
+
+int set_pages_nx(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_nx(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_nx);
+
+int set_pages_ro(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_ro(addr, numpages);
+}
+
+int set_pages_rw(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_rw(addr, numpages);
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+
+static int __set_pages_p(struct page *page, int numpages)
+{
+ struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+ .numpages = numpages,
+ .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .mask_clr = __pgprot(0)};
+
+ return __change_page_attr_set_clr(&cpa, 1);
+}
+
+static int __set_pages_np(struct page *page, int numpages)
+{
+ struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+ .numpages = numpages,
+ .mask_set = __pgprot(0),
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
+
+ return __change_page_attr_set_clr(&cpa, 1);
+}
+
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ if (PageHighMem(page))
+ return;
+ if (!enable) {
+ debug_check_no_locks_freed(page_address(page),
+ numpages * PAGE_SIZE);
+ }
+
+ /*
+ * If page allocator is not up yet then do not call c_p_a():
+ */
+ if (!debug_pagealloc_enabled)
+ return;
+
+ /*
+ * The return value is ignored as the calls cannot fail.
+ * Large pages are kept enabled at boot time, and are
+ * split up quickly with DEBUG_PAGEALLOC. If a splitup
+ * fails here (due to temporary memory shortage) no damage
+ * is done because we just keep the largepage intact up
+ * to the next attempt when it will likely be split up:
+ */
+ if (enable)
+ __set_pages_p(page, numpages);
+ else
+ __set_pages_np(page, numpages);
+
+ /*
+ * We should perform an IPI and flush all tlbs,
+ * but that can deadlock->flush only current cpu:
+ */
+ __flush_tlb_all();
+
+ /*
+ * Try to refill the page pool here. We can do this only after
+ * the tlb flush.
+ */
+ cpa_fill_pool(NULL);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int dpa_show(struct seq_file *m, void *v)
+{
+ seq_puts(m, "DEBUG_PAGEALLOC\n");
+ seq_printf(m, "pool_size : %lu\n", pool_size);
+ seq_printf(m, "pool_pages : %lu\n", pool_pages);
+ seq_printf(m, "pool_low : %lu\n", pool_low);
+ seq_printf(m, "pool_used : %lu\n", pool_used);
+ seq_printf(m, "pool_failed : %lu\n", pool_failed);
+
+ return 0;
+}
+
+static int dpa_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, dpa_show, NULL);
+}
+
+static const struct file_operations dpa_fops = {
+ .open = dpa_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init debug_pagealloc_proc_init(void)
+{
+ struct dentry *de;
+
+ de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
+ &dpa_fops);
+ if (!de)
+ return -ENOMEM;
+
+ return 0;
+}
+__initcall(debug_pagealloc_proc_init);
+#endif
+
+#ifdef CONFIG_HIBERNATION
+
+bool kernel_page_present(struct page *page)
+{
+ unsigned int level;
+ pte_t *pte;
+
+ if (PageHighMem(page))
+ return false;
+
+ pte = lookup_address((unsigned long)page_address(page), &level);
+ return (__pte_val(*pte) & _PAGE_PRESENT);
+}
+
+#endif /* CONFIG_HIBERNATION */
+
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+static inline int in_secondary_range(unsigned long va)
+{
+#ifdef CONFIG_X86_64
+ return va >= VMALLOC_START && va < VMALLOC_END;
+#else
+ return va >= (unsigned long)high_memory;
+#endif
+}
+
+static void __make_page_readonly(unsigned long va)
+{
+ pte_t *pte;
+ unsigned int level;
+
+ pte = lookup_address(va, &level);
+ BUG_ON(!pte || level != PG_LEVEL_4K);
+ if (HYPERVISOR_update_va_mapping(va, pte_wrprotect(*pte), 0))
+ BUG();
+ if (in_secondary_range(va)) {
+ unsigned long pfn = pte_pfn(*pte);
+
+#ifdef CONFIG_HIGHMEM
+ if (pfn >= highstart_pfn)
+ kmap_flush_unused(); /* flush stale writable kmaps */
+ else
+#endif
+ __make_page_readonly((unsigned long)__va(pfn << PAGE_SHIFT));
+ }
+}
+
+static void __make_page_writable(unsigned long va)
+{
+ pte_t *pte;
+ unsigned int level;
+
+ pte = lookup_address(va, &level);
+ BUG_ON(!pte || level != PG_LEVEL_4K);
+ if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
+ BUG();
+ if (in_secondary_range(va)) {
+ unsigned long pfn = pte_pfn(*pte);
+
+#ifdef CONFIG_HIGHMEM
+ if (pfn < highstart_pfn)
+#endif
+ __make_page_writable((unsigned long)__va(pfn << PAGE_SHIFT));
+ }
+}
+
+void make_page_readonly(void *va, unsigned int feature)
+{
+ if (!xen_feature(feature))
+ __make_page_readonly((unsigned long)va);
+}
+
+void make_page_writable(void *va, unsigned int feature)
+{
+ if (!xen_feature(feature))
+ __make_page_writable((unsigned long)va);
+}
+
+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
+{
+ unsigned long addr;
+
+ if (xen_feature(feature))
+ return;
+
+ for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
+ __make_page_readonly(addr);
+}
+
+void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
+{
+ unsigned long addr;
+
+ if (xen_feature(feature))
+ return;
+
+ for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
+ __make_page_writable(addr);
+}
+
+/*
+ * The testcases use internal knowledge of the implementation that shouldn't
+ * be exposed to the rest of the kernel. Include these directly here.
+ */
+#ifdef CONFIG_CPA_DEBUG
+#include "pageattr-test.c"
+#endif
--- /dev/null
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
+ */
+
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/bootmem.h>
+
+#include <asm/msr.h>
+#include <asm/tlbflush.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/pat.h>
+#include <asm/e820.h>
+#include <asm/cacheflush.h>
+#include <asm/fcntl.h>
+#include <asm/mtrr.h>
+#include <asm/io.h>
+
+#ifdef CONFIG_X86_PAT
+int __read_mostly pat_wc_enabled = 1;
+
+void __cpuinit pat_disable(char *reason)
+{
+ pat_wc_enabled = 0;
+ printk(KERN_INFO "%s\n", reason);
+}
+
+static int __init nopat(char *str)
+{
+ pat_disable("PAT support disabled.");
+ return 0;
+}
+early_param("nopat", nopat);
+#endif
+
+static u64 __read_mostly boot_pat_state;
+
+enum {
+ PAT_UC = 0, /* uncached */
+ PAT_WC = 1, /* Write combining */
+ PAT_WT = 4, /* Write Through */
+ PAT_WP = 5, /* Write Protected */
+ PAT_WB = 6, /* Write Back (default) */
+ PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
+};
+
+#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
+
+void pat_init(void)
+{
+ u64 pat;
+
+ if (!pat_wc_enabled)
+ return;
+
+ /* Paranoia check. */
+ if (!cpu_has_pat) {
+ printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
+ /*
+ * Panic if this happens on the secondary CPU, and we
+ * switched to PAT on the boot CPU. We have no way to
+ * undo PAT.
+ */
+ BUG_ON(boot_pat_state);
+ }
+
+#ifndef CONFIG_XEN
+ /* Set PWT to Write-Combining. All other bits stay the same */
+ /*
+ * PTE encoding used in Linux:
+ * PAT
+ * |PCD
+ * ||PWT
+ * |||
+ * 000 WB _PAGE_CACHE_WB
+ * 001 WC _PAGE_CACHE_WC
+ * 010 UC- _PAGE_CACHE_UC_MINUS
+ * 011 UC _PAGE_CACHE_UC
+ * PAT bit unused
+ */
+ pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
+ PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
+
+ /* Boot CPU check */
+ if (!boot_pat_state)
+ rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
+
+ wrmsrl(MSR_IA32_CR_PAT, pat);
+#else
+ /*
+ * PAT settings are part of the hypervisor interface, and their
+ * assignment cannot be changed.
+ */
+ rdmsrl(MSR_IA32_CR_PAT, pat);
+ if (!boot_pat_state)
+ boot_pat_state = pat;
+#endif
+ printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
+ smp_processor_id(), boot_pat_state, pat);
+}
+
+#undef PAT
+
+static char *cattr_name(unsigned long flags)
+{
+ switch (flags & _PAGE_CACHE_MASK) {
+ case _PAGE_CACHE_UC: return "uncached";
+ case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
+ case _PAGE_CACHE_WB: return "write-back";
+ case _PAGE_CACHE_WC: return "write-combining";
+ case _PAGE_CACHE_WP: return "write-protected";
+ case _PAGE_CACHE_WT: return "write-through";
+ default: return "broken";
+ }
+}
+
+/*
+ * The global memtype list keeps track of memory type for specific
+ * physical memory areas. Conflicting memory types in different
+ * mappings can cause CPU cache corruption. To avoid this we keep track.
+ *
+ * The list is sorted based on starting address and can contain multiple
+ * entries for each address (this allows reference counting for overlapping
+ * areas). All the aliases have the same cache attributes of course.
+ * Zero attributes are represented as holes.
+ *
+ * Currently the data structure is a list because the number of mappings
+ * are expected to be relatively small. If this should be a problem
+ * it could be changed to a rbtree or similar.
+ *
+ * memtype_lock protects the whole list.
+ */
+
+struct memtype {
+ u64 start;
+ u64 end;
+ unsigned long type;
+ struct list_head nd;
+};
+
+static LIST_HEAD(memtype_list);
+static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
+
+/*
+ * Does intersection of PAT memory type and MTRR memory type and returns
+ * the resulting memory type as PAT understands it.
+ * (Type in pat and mtrr will not have same value)
+ * The intersection is based on "Effective Memory Type" tables in IA-32
+ * SDM vol 3a
+ */
+static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
+ unsigned long *ret_prot)
+{
+ unsigned long pat_type;
+ u8 mtrr_type;
+
+ pat_type = prot & _PAGE_CACHE_MASK;
+ prot &= (~_PAGE_CACHE_MASK);
+
+ /*
+ * We return the PAT request directly for types where PAT takes
+ * precedence with respect to MTRR and for UC_MINUS.
+ * Consistency checks with other PAT requests is done later
+ * while going through memtype list.
+ */
+ if (pat_type == _PAGE_CACHE_WC) {
+ *ret_prot = prot | _PAGE_CACHE_WC;
+ return 0;
+ } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
+ *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
+ return 0;
+ } else if (pat_type == _PAGE_CACHE_UC) {
+ *ret_prot = prot | _PAGE_CACHE_UC;
+ return 0;
+ }
+
+ /*
+ * Look for MTRR hint to get the effective type in case where PAT
+ * request is for WB.
+ */
+ mtrr_type = mtrr_type_lookup(start, end);
+
+ if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
+ *ret_prot = prot | _PAGE_CACHE_UC;
+ } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
+ *ret_prot = prot | _PAGE_CACHE_WC;
+ } else {
+ *ret_prot = prot | _PAGE_CACHE_WB;
+ }
+
+ return 0;
+}
+
+/*
+ * req_type typically has one of the:
+ * - _PAGE_CACHE_WB
+ * - _PAGE_CACHE_WC
+ * - _PAGE_CACHE_UC_MINUS
+ * - _PAGE_CACHE_UC
+ *
+ * req_type will have a special case value '-1', when requester want to inherit
+ * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
+ *
+ * If ret_type is NULL, function will return an error if it cannot reserve the
+ * region with req_type. If ret_type is non-null, function will return
+ * available type in ret_type in case of no error. In case of any error
+ * it will return a negative return value.
+ */
+int reserve_memtype(u64 start, u64 end, unsigned long req_type,
+ unsigned long *ret_type)
+{
+ struct memtype *new_entry = NULL;
+ struct memtype *parse;
+ unsigned long actual_type;
+ int err = 0;
+
+ /* Only track when pat_wc_enabled */
+ if (!pat_wc_enabled) {
+ /* This is identical to page table setting without PAT */
+ if (ret_type) {
+ if (req_type == -1) {
+ *ret_type = _PAGE_CACHE_WB;
+ } else {
+ *ret_type = req_type;
+ }
+ }
+ return 0;
+ }
+
+ /* Low ISA region is always mapped WB in page table. No need to track */
+ if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
+ if (ret_type)
+ *ret_type = _PAGE_CACHE_WB;
+
+ return 0;
+ }
+
+ if (req_type == -1) {
+ /*
+ * Call mtrr_lookup to get the type hint. This is an
+ * optimization for /dev/mem mmap'ers into WB memory (BIOS
+ * tools and ACPI tools). Use WB request for WB memory and use
+ * UC_MINUS otherwise.
+ */
+ u8 mtrr_type = mtrr_type_lookup(start, end);
+
+ if (mtrr_type == MTRR_TYPE_WRBACK) {
+ req_type = _PAGE_CACHE_WB;
+ actual_type = _PAGE_CACHE_WB;
+ } else {
+ req_type = _PAGE_CACHE_UC_MINUS;
+ actual_type = _PAGE_CACHE_UC_MINUS;
+ }
+ } else {
+ req_type &= _PAGE_CACHE_MASK;
+ err = pat_x_mtrr_type(start, end, req_type, &actual_type);
+ }
+
+ if (err) {
+ if (ret_type)
+ *ret_type = actual_type;
+
+ return -EINVAL;
+ }
+
+ new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!new_entry)
+ return -ENOMEM;
+
+ new_entry->start = start;
+ new_entry->end = end;
+ new_entry->type = actual_type;
+
+ if (ret_type)
+ *ret_type = actual_type;
+
+ spin_lock(&memtype_lock);
+
+ /* Search for existing mapping that overlaps the current range */
+ list_for_each_entry(parse, &memtype_list, nd) {
+ struct memtype *saved_ptr;
+
+ if (parse->start >= end) {
+ pr_debug("New Entry\n");
+ list_add(&new_entry->nd, parse->nd.prev);
+ new_entry = NULL;
+ break;
+ }
+
+ if (start <= parse->start && end >= parse->start) {
+ if (actual_type != parse->type && ret_type) {
+ actual_type = parse->type;
+ *ret_type = actual_type;
+ new_entry->type = actual_type;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+
+ saved_ptr = parse;
+ /*
+ * Check to see whether the request overlaps more
+ * than one entry in the list
+ */
+ list_for_each_entry_continue(parse, &memtype_list, nd) {
+ if (end <= parse->start) {
+ break;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+ }
+
+ if (err) {
+ break;
+ }
+
+ pr_debug("Overlap at 0x%Lx-0x%Lx\n",
+ saved_ptr->start, saved_ptr->end);
+ /* No conflict. Go ahead and add this new entry */
+ list_add(&new_entry->nd, saved_ptr->nd.prev);
+ new_entry = NULL;
+ break;
+ }
+
+ if (start < parse->end) {
+ if (actual_type != parse->type && ret_type) {
+ actual_type = parse->type;
+ *ret_type = actual_type;
+ new_entry->type = actual_type;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+
+ saved_ptr = parse;
+ /*
+ * Check to see whether the request overlaps more
+ * than one entry in the list
+ */
+ list_for_each_entry_continue(parse, &memtype_list, nd) {
+ if (end <= parse->start) {
+ break;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+ }
+
+ if (err) {
+ break;
+ }
+
+ pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
+ saved_ptr->start, saved_ptr->end);
+ /* No conflict. Go ahead and add this new entry */
+ list_add(&new_entry->nd, &saved_ptr->nd);
+ new_entry = NULL;
+ break;
+ }
+ }
+
+ if (err) {
+ printk(KERN_INFO
+ "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
+ start, end, cattr_name(new_entry->type),
+ cattr_name(req_type));
+ kfree(new_entry);
+ spin_unlock(&memtype_lock);
+ return err;
+ }
+
+ if (new_entry) {
+ /* No conflict. Not yet added to the list. Add to the tail */
+ list_add_tail(&new_entry->nd, &memtype_list);
+ pr_debug("New Entry\n");
+ }
+
+ if (ret_type) {
+ pr_debug(
+ "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
+ start, end, cattr_name(actual_type),
+ cattr_name(req_type), cattr_name(*ret_type));
+ } else {
+ pr_debug(
+ "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
+ start, end, cattr_name(actual_type),
+ cattr_name(req_type));
+ }
+
+ spin_unlock(&memtype_lock);
+ return err;
+}
+
+int free_memtype(u64 start, u64 end)
+{
+ struct memtype *ml;
+ int err = -EINVAL;
+
+ /* Only track when pat_wc_enabled */
+ if (!pat_wc_enabled) {
+ return 0;
+ }
+
+ /* Low ISA region is always mapped WB. No need to track */
+ if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
+ return 0;
+ }
+
+ spin_lock(&memtype_lock);
+ list_for_each_entry(ml, &memtype_list, nd) {
+ if (ml->start == start && ml->end == end) {
+ list_del(&ml->nd);
+ kfree(ml);
+ err = 0;
+ break;
+ }
+ }
+ spin_unlock(&memtype_lock);
+
+ if (err) {
+ printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
+ current->comm, current->pid, start, end);
+ }
+
+ pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+ return err;
+}
+
+
+/*
+ * /dev/mem mmap interface. The memtype used for mapping varies:
+ * - Use UC for mappings with O_SYNC flag
+ * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
+ * inherit the memtype from existing mapping.
+ * - Else use UC_MINUS memtype (for backward compatibility with existing
+ * X drivers.
+ */
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot)
+{
+ return vma_prot;
+}
+
+#ifdef CONFIG_NONPROMISC_DEVMEM
+/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+ return 1;
+}
+#else
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+ u64 from = ((u64)pfn) << PAGE_SHIFT;
+ u64 to = from + size;
+ u64 cursor = from;
+
+ while (cursor < to) {
+ if (!devmem_is_allowed(pfn)) {
+ printk(KERN_INFO
+ "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
+ current->comm, from, to);
+ return 0;
+ }
+ cursor += PAGE_SIZE;
+ pfn++;
+ }
+ return 1;
+}
+#endif /* CONFIG_NONPROMISC_DEVMEM */
+
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t *vma_prot)
+{
+ u64 offset = ((u64) pfn) << PAGE_SHIFT;
+ unsigned long flags = _PAGE_CACHE_UC_MINUS;
+ int retval;
+
+ if (!range_is_allowed(pfn, size))
+ return 0;
+
+ if (file->f_flags & O_SYNC) {
+ flags = _PAGE_CACHE_UC;
+ }
+
+#ifndef CONFIG_X86_32
+#ifndef CONFIG_XEN /* Xen sets correct MTRR type on non-RAM for us. */
+ /*
+ * On the PPro and successors, the MTRRs are used to set
+ * memory types for physical addresses outside main memory,
+ * so blindly setting UC or PWT on those pages is wrong.
+ * For Pentiums and earlier, the surround logic should disable
+ * caching for the high addresses through the KEN pin, but
+ * we maintain the tradition of paranoia in this code.
+ */
+ if (!pat_wc_enabled &&
+ ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
+ test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
+ test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
+ test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
+ (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
+ flags = _PAGE_CACHE_UC;
+ }
+#endif
+#endif
+
+ /*
+ * With O_SYNC, we can only take UC mapping. Fail if we cannot.
+ * Without O_SYNC, we want to get
+ * - WB for WB-able memory and no other conflicting mappings
+ * - UC_MINUS for non-WB-able memory with no other conflicting mappings
+ * - Inherit from confliting mappings otherwise
+ */
+ if (flags != _PAGE_CACHE_UC_MINUS) {
+ retval = reserve_memtype(offset, offset + size, flags, NULL);
+ } else {
+ retval = reserve_memtype(offset, offset + size, -1, &flags);
+ }
+
+ if (retval < 0)
+ return 0;
+
+ if (pfn <= max_pfn_mapped &&
+ ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
+ free_memtype(offset, offset + size);
+ printk(KERN_INFO
+ "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
+ current->comm, current->pid,
+ cattr_name(flags),
+ offset, (unsigned long long)(offset + size));
+ return 0;
+ }
+
+ *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
+ flags);
+ return 1;
+}
+
+void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
+{
+ u64 addr = (u64)pfn << PAGE_SHIFT;
+ unsigned long flags;
+ unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
+
+ reserve_memtype(addr, addr + size, want_flags, &flags);
+ if (flags != want_flags) {
+ printk(KERN_INFO
+ "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_flags),
+ addr, (unsigned long long)(addr + size),
+ cattr_name(flags));
+ }
+}
+
+void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
+{
+ u64 addr = (u64)pfn << PAGE_SHIFT;
+
+ free_memtype(addr, addr + size);
+}
+
--- /dev/null
+#include <linux/mm.h>
+#include <xen/features.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/hypervisor.h>
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+ if (pte)
+ make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
+ return pte;
+}
+
+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+ struct page *pte;
+
+#ifdef CONFIG_HIGHPTE
+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+#else
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+#endif
+ if (pte) {
+ pgtable_page_ctor(pte);
+ SetPageForeign(pte, __pte_free);
+ init_page_count(pte);
+ }
+ return pte;
+}
+
+void __pte_free(pgtable_t pte)
+{
+ if (!PageHighMem(pte)) {
+ unsigned long va = (unsigned long)page_address(pte);
+ unsigned int level;
+ pte_t *ptep = lookup_address(va, &level);
+
+ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
+ if (!pte_write(*ptep)
+ && HYPERVISOR_update_va_mapping(va,
+ mk_pte(pte, PAGE_KERNEL),
+ 0))
+ BUG();
+ } else
+#ifdef CONFIG_HIGHPTE
+ ClearPagePinned(pte);
+#else
+ BUG();
+#endif
+
+ ClearPageForeign(pte);
+ init_page_count(pte);
+ pgtable_page_dtor(pte);
+ __free_page(pte);
+}
+
+void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+ pgtable_page_dtor(pte);
+ paravirt_release_pte(page_to_pfn(pte));
+ tlb_remove_page(tlb, pte);
+}
+
+#if PAGETABLE_LEVELS > 2
+pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+ struct page *pmd;
+
+ pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+ if (!pmd)
+ return NULL;
+ SetPageForeign(pmd, __pmd_free);
+ init_page_count(pmd);
+ return page_address(pmd);
+}
+
+void __pmd_free(pgtable_t pmd)
+{
+ unsigned long va = (unsigned long)page_address(pmd);
+ unsigned int level;
+ pte_t *ptep = lookup_address(va, &level);
+
+ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
+ if (!pte_write(*ptep)
+ && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
+ BUG();
+
+ ClearPageForeign(pmd);
+ init_page_count(pmd);
+ __free_page(pmd);
+}
+
+void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+ paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(pmd));
+}
+
+#if PAGETABLE_LEVELS > 3
+void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+{
+ paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(pud));
+}
+#endif /* PAGETABLE_LEVELS > 3 */
+#endif /* PAGETABLE_LEVELS > 2 */
+
+static inline void pgd_list_add(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+
+ list_add(&page->lru, &pgd_list);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+
+ list_del(&page->lru);
+}
+
+#define UNSHARED_PTRS_PER_PGD \
+ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
+
+static void pgd_ctor(void *p)
+{
+ pgd_t *pgd = p;
+ unsigned long flags;
+
+ pgd_test_and_unpin(pgd);
+
+ /* Clear usermode parts of PGD */
+ memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
+
+ spin_lock_irqsave(&pgd_lock, flags);
+
+ /* If the pgd points to a shared pagetable level (either the
+ ptes in non-PAE, or shared PMD in PAE), then just copy the
+ references from swapper_pg_dir. */
+ if (PAGETABLE_LEVELS == 2 ||
+ (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+ PAGETABLE_LEVELS == 4) {
+ clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+ paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
+ __pa(swapper_pg_dir) >> PAGE_SHIFT,
+ KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+ }
+
+#ifdef CONFIG_X86_64
+ /*
+ * Set level3_user_pgt for vsyscall area
+ */
+ __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
+ __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
+#endif
+
+#ifndef CONFIG_X86_PAE
+ /* list required to sync kernel mapping updates */
+ if (!SHARED_KERNEL_PMD)
+ pgd_list_add(pgd);
+#endif
+
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+static void pgd_dtor(void *pgd)
+{
+ unsigned long flags; /* can be called from interrupt context */
+
+ if (!SHARED_KERNEL_PMD) {
+ spin_lock_irqsave(&pgd_lock, flags);
+ pgd_list_del(pgd);
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ }
+
+ pgd_test_and_unpin(pgd);
+}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * -- wli
+ */
+
+#ifdef CONFIG_X86_PAE
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{
+ int i;
+
+ for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+ pgd_t pgd = pgdp[i];
+
+ if (__pgd_val(pgd) != 0) {
+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+ pgdp[i] = xen_make_pgd(0);
+
+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+ pmd_free(mm, pmd);
+ }
+ }
+
+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
+ xen_destroy_contiguous_region((unsigned long)pgdp, 0);
+}
+
+/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update. Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+ pud_t *pud;
+ pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
+ unsigned long addr, flags;
+ int i;
+
+ /*
+ * We can race save/restore (if we sleep during a GFP_KERNEL memory
+ * allocation). We therefore store virtual addresses of pmds as they
+ * do not change across save/restore, and poke the machine addresses
+ * into the pgdir under the pgd_lock.
+ */
+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
+ pmds[i] = pmd_alloc_one(mm, addr);
+ if (!pmds[i])
+ goto out_oom;
+ }
+
+ spin_lock_irqsave(&pgd_lock, flags);
+
+ /* Protect against save/restore: move below 4GB under pgd_lock. */
+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
+ && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
+ spin_unlock_irqrestore(&pgd_lock, flags);
+out_oom:
+ while (i--)
+ pmd_free(mm, pmds[i]);
+ return 0;
+ }
+
+ /* Copy kernel pmd contents and write-protect the new pmds. */
+ pud = pud_offset(pgd, 0);
+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
+ i++, pud++, addr += PUD_SIZE) {
+ if (i >= KERNEL_PGD_BOUNDARY) {
+ memcpy(pmds[i],
+ (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+ sizeof(pmd_t) * PTRS_PER_PMD);
+ make_lowmem_page_readonly(
+ pmds[i], XENFEAT_writable_page_tables);
+ }
+
+ /* It is safe to poke machine addresses of pmds under the pgd_lock. */
+ pud_populate(mm, pud, pmds[i]);
+ }
+
+ /* List required to sync kernel mapping updates and
+ * to pin/unpin on save/restore. */
+ pgd_list_add(pgd);
+
+ spin_unlock_irqrestore(&pgd_lock, flags);
+
+ return 1;
+}
+
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+ struct page *page = virt_to_page(pmd);
+ unsigned long pfn = page_to_pfn(page);
+
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+ /* Note: almost everything apart from _PAGE_PRESENT is
+ reserved at the pmd (PDPT) level. */
+ if (PagePinned(virt_to_page(mm->pgd))) {
+ BUG_ON(PageHighMem(page));
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
+ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+ } else
+ *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
+
+ /*
+ * According to Intel App note "TLBs, Paging-Structure Caches,
+ * and Their Invalidation", April 2007, document 317080-001,
+ * section 8.1: in PAE mode we explicitly have to flush the
+ * TLB via cr3 if the top-level pgd is changed...
+ */
+ if (mm == current->active_mm)
+ xen_tlb_flush();
+}
+#else /* !CONFIG_X86_PAE */
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+ return 1;
+}
+
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
+{
+}
+#endif /* CONFIG_X86_PAE */
+
+#ifdef CONFIG_X86_64
+/* We allocate two contiguous pages for kernel and user. */
+#define PGD_ORDER 1
+#else
+#define PGD_ORDER 0
+#endif
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
+
+ /* so that alloc_pd can use it */
+ mm->pgd = pgd;
+ if (pgd)
+ pgd_ctor(pgd);
+
+ if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
+ free_pages((unsigned long)pgd, PGD_ORDER);
+ pgd = NULL;
+ }
+
+ return pgd;
+}
+
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ /*
+ * After this the pgd should not be pinned for the duration of this
+ * function's execution. We should never sleep and thus never race:
+ * 1. User pmds will not become write-protected under our feet due
+ * to a concurrent mm_pin_all().
+ * 2. The machine addresses in PGD entries will not become invalid
+ * due to a concurrent save/restore.
+ */
+ pgd_dtor(pgd);
+
+ pgd_mop_up_pmds(mm, pgd);
+ free_pages((unsigned long)pgd, PGD_ORDER);
+}
+
+int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ int changed = !pte_same(*ptep, entry);
+
+ if (changed && dirty) {
+ if (likely(vma->vm_mm == current->mm)) {
+ if (HYPERVISOR_update_va_mapping(address,
+ entry,
+ (unsigned long)vma->vm_mm->cpu_vm_mask.bits|
+ UVMF_INVLPG|UVMF_MULTI))
+ BUG();
+ } else {
+ xen_l1_entry_update(ptep, entry);
+ flush_tlb_page(vma, address);
+ }
+ }
+
+ return changed;
+}
+
+int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ int ret = 0;
+
+ if (pte_young(*ptep))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ &ptep->pte);
+
+ if (ret)
+ pte_update(vma->vm_mm, addr, ptep);
+
+ return ret;
+}
+
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ pte_t pte = *ptep;
+ int young = pte_young(pte);
+
+ pte = pte_mkold(pte);
+ if (PagePinned(virt_to_page(vma->vm_mm->pgd)))
+ ptep_set_access_flags(vma, address, ptep, pte, young);
+ else if (young)
+ ptep->pte_low = pte.pte_low;
+
+ return young;
+}
--- /dev/null
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/nmi.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/quicklist.h>
+
+#include <asm/system.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+
+#include <xen/features.h>
+#include <asm/hypervisor.h>
+
+void show_mem(void)
+{
+ int total = 0, reserved = 0;
+ int shared = 0, cached = 0;
+ int highmem = 0;
+ struct page *page;
+ pg_data_t *pgdat;
+ unsigned long i;
+ unsigned long flags;
+
+ printk(KERN_INFO "Mem-info:\n");
+ show_free_areas();
+ for_each_online_pgdat(pgdat) {
+ pgdat_resize_lock(pgdat, &flags);
+ for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+ if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
+ touch_nmi_watchdog();
+ page = pgdat_page_nr(pgdat, i);
+ total++;
+ if (PageHighMem(page))
+ highmem++;
+ if (PageReserved(page))
+ reserved++;
+ else if (PageSwapCache(page))
+ cached++;
+ else if (page_count(page))
+ shared += page_count(page) - 1;
+ }
+ pgdat_resize_unlock(pgdat, &flags);
+ }
+ printk(KERN_INFO "%d pages of RAM\n", total);
+ printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
+ printk(KERN_INFO "%d reserved pages\n", reserved);
+ printk(KERN_INFO "%d pages shared\n", shared);
+ printk(KERN_INFO "%d pages swap cached\n", cached);
+
+ printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
+ printk(KERN_INFO "%lu pages writeback\n",
+ global_page_state(NR_WRITEBACK));
+ printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
+ printk(KERN_INFO "%lu pages slab\n",
+ global_page_state(NR_SLAB_RECLAIMABLE) +
+ global_page_state(NR_SLAB_UNRECLAIMABLE));
+ printk(KERN_INFO "%lu pages pagetables\n",
+ global_page_state(NR_PAGETABLE));
+}
+
+/*
+ * Associate a large virtual page frame with a given physical page frame
+ * and protection flags for that frame. pfn is for the base of the page,
+ * vaddr is what the page gets mapped to - both must be properly aligned.
+ * The pmd must already be instantiated. Assumes PAE mode.
+ */
+void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
+ printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
+ return; /* BUG(); */
+ }
+ if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
+ printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
+ return; /* BUG(); */
+ }
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ if (pgd_none(*pgd)) {
+ printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
+ return; /* BUG(); */
+ }
+ pud = pud_offset(pgd, vaddr);
+ pmd = pmd_offset(pud, vaddr);
+ set_pmd(pmd, pfn_pmd(pfn, flags));
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
+
+static int fixmaps;
+unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
+unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
+EXPORT_SYMBOL(__FIXADDR_TOP);
+
+void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
+{
+ unsigned long address = __fix_to_virt(idx);
+ pte_t pte;
+
+ if (idx >= __end_of_fixed_addresses) {
+ BUG();
+ return;
+ }
+ switch (idx) {
+ case FIX_WP_TEST:
+ case FIX_VDSO:
+ pte = pfn_pte(phys >> PAGE_SHIFT, flags);
+ break;
+ default:
+ pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
+ break;
+ }
+ if (HYPERVISOR_update_va_mapping(address, pte,
+ UVMF_INVLPG|UVMF_ALL))
+ BUG();
+ fixmaps++;
+}
+
+/**
+ * reserve_top_address - reserves a hole in the top of kernel address space
+ * @reserve - size of hole to reserve
+ *
+ * Can be used to relocate the fixmap area and poke a hole in the top
+ * of kernel address space to make room for a hypervisor.
+ */
+void __init reserve_top_address(unsigned long reserve)
+{
+ BUG_ON(fixmaps > 0);
+ printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
+ (int)-reserve);
+ __FIXADDR_TOP = -reserve - PAGE_SIZE;
+ __VMALLOC_RESERVE += reserve;
+}
+
+void make_lowmem_page_readonly(void *va, unsigned int feature)
+{
+ pte_t *pte;
+ unsigned int level;
+ int rc;
+
+ if (xen_feature(feature))
+ return;
+
+ pte = lookup_address((unsigned long)va, &level);
+ BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
+ rc = HYPERVISOR_update_va_mapping(
+ (unsigned long)va, pte_wrprotect(*pte), 0);
+ BUG_ON(rc);
+}
+
+void make_lowmem_page_writable(void *va, unsigned int feature)
+{
+ pte_t *pte;
+ unsigned int level;
+ int rc;
+
+ if (xen_feature(feature))
+ return;
+
+ pte = lookup_address((unsigned long)va, &level);
+ BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
+ rc = HYPERVISOR_update_va_mapping(
+ (unsigned long)va, pte_mkwrite(*pte), 0);
+ BUG_ON(rc);
+}
oprofilefs.o oprofile_stats.o \
timer_int.o )
+ifdef CONFIG_XEN
+XENOPROF_COMMON_OBJS = $(addprefix ../../../drivers/xen/xenoprof/, \
+ xenoprofile.o)
+oprofile-y := $(DRIVER_OBJS) \
+ $(XENOPROF_COMMON_OBJS) xenoprof.o
+else
oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \
op_model_ppro.o op_model_p4.o
oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
+endif
--- /dev/null
+/**
+ * @file xenoprof.c
+ *
+ * @remark Copyright 2002 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * @author John Levon <levon@movementarian.org>
+ *
+ * Modified by Aravind Menon and Jose Renato Santos for Xen
+ * These modifications are:
+ * Copyright (C) 2005 Hewlett-Packard Co.
+ *
+ * x86-specific part
+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan K.K.
+ */
+
+#include <linux/init.h>
+#include <linux/oprofile.h>
+#include <linux/sched.h>
+#include <asm/pgtable.h>
+
+#include <xen/driver_util.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/xenoprof.h>
+#include <xen/xenoprof.h>
+#include "op_counter.h"
+
+static unsigned int num_events = 0;
+
+void __init xenoprof_arch_init_counter(struct xenoprof_init *init)
+{
+ num_events = init->num_events;
+ /* just in case - make sure we do not overflow event list
+ (i.e. counter_config list) */
+ if (num_events > OP_MAX_COUNTER) {
+ num_events = OP_MAX_COUNTER;
+ init->num_events = num_events;
+ }
+}
+
+void xenoprof_arch_counter(void)
+{
+ int i;
+ struct xenoprof_counter counter;
+
+ for (i=0; i<num_events; i++) {
+ counter.ind = i;
+ counter.count = (uint64_t)counter_config[i].count;
+ counter.enabled = (uint32_t)counter_config[i].enabled;
+ counter.event = (uint32_t)counter_config[i].event;
+ counter.kernel = (uint32_t)counter_config[i].kernel;
+ counter.user = (uint32_t)counter_config[i].user;
+ counter.unit_mask = (uint64_t)counter_config[i].unit_mask;
+ WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_counter,
+ &counter));
+ }
+}
+
+void xenoprof_arch_start(void)
+{
+ /* nothing */
+}
+
+void xenoprof_arch_stop(void)
+{
+ /* nothing */
+}
+
+void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer * sbuf)
+{
+ if (sbuf->buffer) {
+ vunmap(sbuf->buffer);
+ sbuf->buffer = NULL;
+ }
+}
+
+int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer * get_buffer,
+ struct xenoprof_shared_buffer * sbuf)
+{
+ int npages, ret;
+ struct vm_struct *area;
+
+ sbuf->buffer = NULL;
+ if ( (ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer)) )
+ return ret;
+
+ npages = (get_buffer->bufsize * get_buffer->nbuf - 1) / PAGE_SIZE + 1;
+
+ area = alloc_vm_area(npages * PAGE_SIZE);
+ if (area == NULL)
+ return -ENOMEM;
+
+ if ( (ret = direct_kernel_remap_pfn_range(
+ (unsigned long)area->addr,
+ get_buffer->buf_gmaddr >> PAGE_SHIFT,
+ npages * PAGE_SIZE, __pgprot(_KERNPG_TABLE),
+ DOMID_SELF)) ) {
+ vunmap(area->addr);
+ return ret;
+ }
+
+ sbuf->buffer = area->addr;
+ return ret;
+}
+
+int xenoprof_arch_set_passive(struct xenoprof_passive * pdomain,
+ struct xenoprof_shared_buffer * sbuf)
+{
+ int ret;
+ int npages;
+ struct vm_struct *area;
+ pgprot_t prot = __pgprot(_KERNPG_TABLE);
+
+ sbuf->buffer = NULL;
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain);
+ if (ret)
+ goto out;
+
+ npages = (pdomain->bufsize * pdomain->nbuf - 1) / PAGE_SIZE + 1;
+
+ area = alloc_vm_area(npages * PAGE_SIZE);
+ if (area == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = direct_kernel_remap_pfn_range(
+ (unsigned long)area->addr,
+ pdomain->buf_gmaddr >> PAGE_SHIFT,
+ npages * PAGE_SIZE, prot, DOMID_SELF);
+ if (ret) {
+ vunmap(area->addr);
+ goto out;
+ }
+ sbuf->buffer = area->addr;
+
+out:
+ return ret;
+}
+
+struct op_counter_config counter_config[OP_MAX_COUNTER];
+
+int xenoprof_create_files(struct super_block * sb, struct dentry * root)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_events; ++i) {
+ struct dentry * dir;
+ char buf[2];
+
+ snprintf(buf, 2, "%d", i);
+ dir = oprofilefs_mkdir(sb, root, buf);
+ oprofilefs_create_ulong(sb, dir, "enabled",
+ &counter_config[i].enabled);
+ oprofilefs_create_ulong(sb, dir, "event",
+ &counter_config[i].event);
+ oprofilefs_create_ulong(sb, dir, "count",
+ &counter_config[i].count);
+ oprofilefs_create_ulong(sb, dir, "unit_mask",
+ &counter_config[i].unit_mask);
+ oprofilefs_create_ulong(sb, dir, "kernel",
+ &counter_config[i].kernel);
+ oprofilefs_create_ulong(sb, dir, "user",
+ &counter_config[i].user);
+ }
+
+ return 0;
+}
+
+int __init oprofile_arch_init(struct oprofile_operations * ops)
+{
+ return xenoprofile_init(ops);
+}
+
+void oprofile_arch_exit(void)
+{
+ xenoprofile_exit();
+}
else
include ${srctree}/arch/x86/pci/Makefile_64
endif
+
+# pcifront should be after mmconfig.o and direct.o as it should only
+# take over if direct access to the PCI bus is unavailable
+obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront.o
--- /dev/null
+/*
+ * Low-Level PCI Support for PC -- Routing of Interrupts
+ *
+ * (c) 1999--2000 Martin Mares <mj@ucw.cz>
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/dmi.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/io_apic.h>
+#include <linux/irq.h>
+#include <linux/acpi.h>
+
+#include "pci.h"
+
+#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
+#define PIRQ_VERSION 0x0100
+
+static int broken_hp_bios_irq9;
+static int acer_tm360_irqrouting;
+
+static struct irq_routing_table *pirq_table;
+
+static int pirq_enable_irq(struct pci_dev *dev);
+
+/*
+ * Never use: 0, 1, 2 (timer, keyboard, and cascade)
+ * Avoid using: 13, 14 and 15 (FP error and IDE).
+ * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
+ */
+unsigned int pcibios_irq_mask = 0xfff8;
+
+static int pirq_penalty[16] = {
+ 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
+ 0, 0, 0, 0, 1000, 100000, 100000, 100000
+};
+
+struct irq_router {
+ char *name;
+ u16 vendor, device;
+ int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
+ int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
+};
+
+struct irq_router_handler {
+ u16 vendor;
+ int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
+};
+
+int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
+void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
+
+/*
+ * Check passed address for the PCI IRQ Routing Table signature
+ * and perform checksum verification.
+ */
+
+static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
+{
+ struct irq_routing_table *rt;
+ int i;
+ u8 sum;
+
+ rt = (struct irq_routing_table *) addr;
+ if (rt->signature != PIRQ_SIGNATURE ||
+ rt->version != PIRQ_VERSION ||
+ rt->size % 16 ||
+ rt->size < sizeof(struct irq_routing_table))
+ return NULL;
+ sum = 0;
+ for (i=0; i < rt->size; i++)
+ sum += addr[i];
+ if (!sum) {
+ DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
+ return rt;
+ }
+ return NULL;
+}
+
+
+
+/*
+ * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
+ */
+
+static struct irq_routing_table * __init pirq_find_routing_table(void)
+{
+ u8 *addr;
+ struct irq_routing_table *rt;
+
+#ifdef CONFIG_XEN
+ if (!is_initial_xendomain())
+ return NULL;
+#endif
+ if (pirq_table_addr) {
+ rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr));
+ if (rt)
+ return rt;
+ printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
+ }
+ for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
+ rt = pirq_check_routing_table(addr);
+ if (rt)
+ return rt;
+ }
+ return NULL;
+}
+
+/*
+ * If we have a IRQ routing table, use it to search for peer host
+ * bridges. It's a gross hack, but since there are no other known
+ * ways how to get a list of buses, we have to go this way.
+ */
+
+static void __init pirq_peer_trick(void)
+{
+ struct irq_routing_table *rt = pirq_table;
+ u8 busmap[256];
+ int i;
+ struct irq_info *e;
+
+ memset(busmap, 0, sizeof(busmap));
+ for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
+ e = &rt->slots[i];
+#ifdef DEBUG
+ {
+ int j;
+ DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
+ for(j=0; j<4; j++)
+ DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
+ DBG("\n");
+ }
+#endif
+ busmap[e->bus] = 1;
+ }
+ for(i = 1; i < 256; i++) {
+ int node;
+ if (!busmap[i] || pci_find_bus(0, i))
+ continue;
+ node = get_mp_bus_to_node(i);
+ if (pci_scan_bus_on_node(i, &pci_root_ops, node))
+ printk(KERN_INFO "PCI: Discovered primary peer "
+ "bus %02x [IRQ]\n", i);
+ }
+ pcibios_last_bus = -1;
+}
+
+/*
+ * Code for querying and setting of IRQ routes on various interrupt routers.
+ */
+
+void eisa_set_level_irq(unsigned int irq)
+{
+ unsigned char mask = 1 << (irq & 7);
+ unsigned int port = 0x4d0 + (irq >> 3);
+ unsigned char val;
+ static u16 eisa_irq_mask;
+
+ if (irq >= 16 || (1 << irq) & eisa_irq_mask)
+ return;
+
+ eisa_irq_mask |= (1 << irq);
+ printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
+ val = inb(port);
+ if (!(val & mask)) {
+ DBG(KERN_DEBUG " -> edge");
+ outb(val | mask, port);
+ }
+}
+
+/*
+ * Common IRQ routing practice: nibbles in config space,
+ * offset by some magic constant.
+ */
+static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
+{
+ u8 x;
+ unsigned reg = offset + (nr >> 1);
+
+ pci_read_config_byte(router, reg, &x);
+ return (nr & 1) ? (x >> 4) : (x & 0xf);
+}
+
+static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
+{
+ u8 x;
+ unsigned reg = offset + (nr >> 1);
+
+ pci_read_config_byte(router, reg, &x);
+ x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
+ pci_write_config_byte(router, reg, x);
+}
+
+/*
+ * ALI pirq entries are damn ugly, and completely undocumented.
+ * This has been figured out from pirq tables, and it's not a pretty
+ * picture.
+ */
+static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
+
+ WARN_ON_ONCE(pirq > 16);
+ return irqmap[read_config_nybble(router, 0x48, pirq-1)];
+}
+
+static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
+ unsigned int val = irqmap[irq];
+
+ WARN_ON_ONCE(pirq > 16);
+ if (val) {
+ write_config_nybble(router, 0x48, pirq-1, val);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
+ * just a pointer to the config space.
+ */
+static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ u8 x;
+
+ pci_read_config_byte(router, pirq, &x);
+ return (x < 16) ? x : 0;
+}
+
+static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ pci_write_config_byte(router, pirq, irq);
+ return 1;
+}
+
+/*
+ * The VIA pirq rules are nibble-based, like ALI,
+ * but without the ugly irq number munging.
+ * However, PIRQD is in the upper instead of lower 4 bits.
+ */
+static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
+}
+
+static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
+ return 1;
+}
+
+/*
+ * The VIA pirq rules are nibble-based, like ALI,
+ * but without the ugly irq number munging.
+ * However, for 82C586, nibble map is different .
+ */
+static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
+
+ WARN_ON_ONCE(pirq > 5);
+ return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
+}
+
+static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
+
+ WARN_ON_ONCE(pirq > 5);
+ write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
+ return 1;
+}
+
+/*
+ * ITE 8330G pirq rules are nibble-based
+ * FIXME: pirqmap may be { 1, 0, 3, 2 },
+ * 2+3 are both mapped to irq 9 on my system
+ */
+static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
+
+ WARN_ON_ONCE(pirq > 4);
+ return read_config_nybble(router,0x43, pirqmap[pirq-1]);
+}
+
+static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
+
+ WARN_ON_ONCE(pirq > 4);
+ write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
+ return 1;
+}
+
+/*
+ * OPTI: high four bits are nibble pointer..
+ * I wonder what the low bits do?
+ */
+static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ return read_config_nybble(router, 0xb8, pirq >> 4);
+}
+
+static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ write_config_nybble(router, 0xb8, pirq >> 4, irq);
+ return 1;
+}
+
+/*
+ * Cyrix: nibble offset 0x5C
+ * 0x5C bits 7:4 is INTB bits 3:0 is INTA
+ * 0x5D bits 7:4 is INTD bits 3:0 is INTC
+ */
+static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ return read_config_nybble(router, 0x5C, (pirq-1)^1);
+}
+
+static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ write_config_nybble(router, 0x5C, (pirq-1)^1, irq);
+ return 1;
+}
+
+/*
+ * PIRQ routing for SiS 85C503 router used in several SiS chipsets.
+ * We have to deal with the following issues here:
+ * - vendors have different ideas about the meaning of link values
+ * - some onboard devices (integrated in the chipset) have special
+ * links and are thus routed differently (i.e. not via PCI INTA-INTD)
+ * - different revision of the router have a different layout for
+ * the routing registers, particularly for the onchip devices
+ *
+ * For all routing registers the common thing is we have one byte
+ * per routeable link which is defined as:
+ * bit 7 IRQ mapping enabled (0) or disabled (1)
+ * bits [6:4] reserved (sometimes used for onchip devices)
+ * bits [3:0] IRQ to map to
+ * allowed: 3-7, 9-12, 14-15
+ * reserved: 0, 1, 2, 8, 13
+ *
+ * The config-space registers located at 0x41/0x42/0x43/0x44 are
+ * always used to route the normal PCI INT A/B/C/D respectively.
+ * Apparently there are systems implementing PCI routing table using
+ * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
+ * We try our best to handle both link mappings.
+ *
+ * Currently (2003-05-21) it appears most SiS chipsets follow the
+ * definition of routing registers from the SiS-5595 southbridge.
+ * According to the SiS 5595 datasheets the revision id's of the
+ * router (ISA-bridge) should be 0x01 or 0xb0.
+ *
+ * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
+ * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
+ * They seem to work with the current routing code. However there is
+ * some concern because of the two USB-OHCI HCs (original SiS 5595
+ * had only one). YMMV.
+ *
+ * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
+ *
+ * 0x61: IDEIRQ:
+ * bits [6:5] must be written 01
+ * bit 4 channel-select primary (0), secondary (1)
+ *
+ * 0x62: USBIRQ:
+ * bit 6 OHCI function disabled (0), enabled (1)
+ *
+ * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
+ *
+ * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
+ *
+ * We support USBIRQ (in addition to INTA-INTD) and keep the
+ * IDE, ACPI and DAQ routing untouched as set by the BIOS.
+ *
+ * Currently the only reported exception is the new SiS 65x chipset
+ * which includes the SiS 69x southbridge. Here we have the 85C503
+ * router revision 0x04 and there are changes in the register layout
+ * mostly related to the different USB HCs with USB 2.0 support.
+ *
+ * Onchip routing for router rev-id 0x04 (try-and-error observation)
+ *
+ * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs
+ * bit 6-4 are probably unused, not like 5595
+ */
+
+#define PIRQ_SIS_IRQ_MASK 0x0f
+#define PIRQ_SIS_IRQ_DISABLE 0x80
+#define PIRQ_SIS_USB_ENABLE 0x40
+
+static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ u8 x;
+ int reg;
+
+ reg = pirq;
+ if (reg >= 0x01 && reg <= 0x04)
+ reg += 0x40;
+ pci_read_config_byte(router, reg, &x);
+ return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
+}
+
+static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ u8 x;
+ int reg;
+
+ reg = pirq;
+ if (reg >= 0x01 && reg <= 0x04)
+ reg += 0x40;
+ pci_read_config_byte(router, reg, &x);
+ x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
+ x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
+ pci_write_config_byte(router, reg, x);
+ return 1;
+}
+
+
+/*
+ * VLSI: nibble offset 0x74 - educated guess due to routing table and
+ * config space of VLSI 82C534 PCI-bridge/router (1004:0102)
+ * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
+ * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
+ * for the busbridge to the docking station.
+ */
+
+static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ WARN_ON_ONCE(pirq >= 9);
+ if (pirq > 8) {
+ printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
+ return 0;
+ }
+ return read_config_nybble(router, 0x74, pirq-1);
+}
+
+static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ WARN_ON_ONCE(pirq >= 9);
+ if (pirq > 8) {
+ printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
+ return 0;
+ }
+ write_config_nybble(router, 0x74, pirq-1, irq);
+ return 1;
+}
+
+/*
+ * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
+ * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register
+ * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect
+ * register is a straight binary coding of desired PIC IRQ (low nibble).
+ *
+ * The 'link' value in the PIRQ table is already in the correct format
+ * for the Index register. There are some special index values:
+ * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
+ * and 0x03 for SMBus.
+ */
+static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ outb(pirq, 0xc00);
+ return inb(0xc01) & 0xf;
+}
+
+static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ outb(pirq, 0xc00);
+ outb(irq, 0xc01);
+ return 1;
+}
+
+/* Support for AMD756 PCI IRQ Routing
+ * Jhon H. Caicedo <jhcaiced@osso.org.co>
+ * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
+ * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
+ * The AMD756 pirq rules are nibble-based
+ * offset 0x56 0-3 PIRQA 4-7 PIRQB
+ * offset 0x57 0-3 PIRQC 4-7 PIRQD
+ */
+static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ u8 irq;
+ irq = 0;
+ if (pirq <= 4)
+ {
+ irq = read_config_nybble(router, 0x56, pirq - 1);
+ }
+ printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
+ dev->vendor, dev->device, pirq, irq);
+ return irq;
+}
+
+static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
+ dev->vendor, dev->device, pirq, irq);
+ if (pirq <= 4)
+ {
+ write_config_nybble(router, 0x56, pirq - 1, irq);
+ }
+ return 1;
+}
+
+/*
+ * PicoPower PT86C523
+ */
+static int pirq_pico_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ outb(0x10 + ((pirq - 1) >> 1), 0x24);
+ return ((pirq - 1) & 1) ? (inb(0x26) >> 4) : (inb(0x26) & 0xf);
+}
+
+static int pirq_pico_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
+ int irq)
+{
+ unsigned int x;
+ outb(0x10 + ((pirq - 1) >> 1), 0x24);
+ x = inb(0x26);
+ x = ((pirq - 1) & 1) ? ((x & 0x0f) | (irq << 4)) : ((x & 0xf0) | (irq));
+ outb(x, 0x26);
+ return 1;
+}
+
+#ifdef CONFIG_PCI_BIOS
+
+static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ struct pci_dev *bridge;
+ int pin = pci_get_interrupt_pin(dev, &bridge);
+ return pcibios_set_irq_routing(bridge, pin, irq);
+}
+
+#endif
+
+static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ static struct pci_device_id __initdata pirq_440gx[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
+ { },
+ };
+
+ /* 440GX has a proprietary PIRQ router -- don't use it */
+ if (pci_dev_present(pirq_440gx))
+ return 0;
+
+ switch(device)
+ {
+ case PCI_DEVICE_ID_INTEL_82371FB_0:
+ case PCI_DEVICE_ID_INTEL_82371SB_0:
+ case PCI_DEVICE_ID_INTEL_82371AB_0:
+ case PCI_DEVICE_ID_INTEL_82371MX:
+ case PCI_DEVICE_ID_INTEL_82443MX_0:
+ case PCI_DEVICE_ID_INTEL_82801AA_0:
+ case PCI_DEVICE_ID_INTEL_82801AB_0:
+ case PCI_DEVICE_ID_INTEL_82801BA_0:
+ case PCI_DEVICE_ID_INTEL_82801BA_10:
+ case PCI_DEVICE_ID_INTEL_82801CA_0:
+ case PCI_DEVICE_ID_INTEL_82801CA_12:
+ case PCI_DEVICE_ID_INTEL_82801DB_0:
+ case PCI_DEVICE_ID_INTEL_82801E_0:
+ case PCI_DEVICE_ID_INTEL_82801EB_0:
+ case PCI_DEVICE_ID_INTEL_ESB_1:
+ case PCI_DEVICE_ID_INTEL_ICH6_0:
+ case PCI_DEVICE_ID_INTEL_ICH6_1:
+ case PCI_DEVICE_ID_INTEL_ICH7_0:
+ case PCI_DEVICE_ID_INTEL_ICH7_1:
+ case PCI_DEVICE_ID_INTEL_ICH7_30:
+ case PCI_DEVICE_ID_INTEL_ICH7_31:
+ case PCI_DEVICE_ID_INTEL_ESB2_0:
+ case PCI_DEVICE_ID_INTEL_ICH8_0:
+ case PCI_DEVICE_ID_INTEL_ICH8_1:
+ case PCI_DEVICE_ID_INTEL_ICH8_2:
+ case PCI_DEVICE_ID_INTEL_ICH8_3:
+ case PCI_DEVICE_ID_INTEL_ICH8_4:
+ case PCI_DEVICE_ID_INTEL_ICH9_0:
+ case PCI_DEVICE_ID_INTEL_ICH9_1:
+ case PCI_DEVICE_ID_INTEL_ICH9_2:
+ case PCI_DEVICE_ID_INTEL_ICH9_3:
+ case PCI_DEVICE_ID_INTEL_ICH9_4:
+ case PCI_DEVICE_ID_INTEL_ICH9_5:
+ case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
+ case PCI_DEVICE_ID_INTEL_ICH10_0:
+ case PCI_DEVICE_ID_INTEL_ICH10_1:
+ case PCI_DEVICE_ID_INTEL_ICH10_2:
+ case PCI_DEVICE_ID_INTEL_ICH10_3:
+ r->name = "PIIX/ICH";
+ r->get = pirq_piix_get;
+ r->set = pirq_piix_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __init int via_router_probe(struct irq_router *r,
+ struct pci_dev *router, u16 device)
+{
+ /* FIXME: We should move some of the quirk fixup stuff here */
+
+ /*
+ * workarounds for some buggy BIOSes
+ */
+ if (device == PCI_DEVICE_ID_VIA_82C586_0) {
+ switch(router->device) {
+ case PCI_DEVICE_ID_VIA_82C686:
+ /*
+ * Asus k7m bios wrongly reports 82C686A
+ * as 586-compatible
+ */
+ device = PCI_DEVICE_ID_VIA_82C686;
+ break;
+ case PCI_DEVICE_ID_VIA_8235:
+ /**
+ * Asus a7v-x bios wrongly reports 8235
+ * as 586-compatible
+ */
+ device = PCI_DEVICE_ID_VIA_8235;
+ break;
+ case PCI_DEVICE_ID_VIA_8237:
+ /**
+ * Asus a7v600 bios wrongly reports 8237
+ * as 586-compatible
+ */
+ device = PCI_DEVICE_ID_VIA_8237;
+ break;
+ }
+ }
+
+ switch(device) {
+ case PCI_DEVICE_ID_VIA_82C586_0:
+ r->name = "VIA";
+ r->get = pirq_via586_get;
+ r->set = pirq_via586_set;
+ return 1;
+ case PCI_DEVICE_ID_VIA_82C596:
+ case PCI_DEVICE_ID_VIA_82C686:
+ case PCI_DEVICE_ID_VIA_8231:
+ case PCI_DEVICE_ID_VIA_8233A:
+ case PCI_DEVICE_ID_VIA_8235:
+ case PCI_DEVICE_ID_VIA_8237:
+ /* FIXME: add new ones for 8233/5 */
+ r->name = "VIA";
+ r->get = pirq_via_get;
+ r->set = pirq_via_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch(device)
+ {
+ case PCI_DEVICE_ID_VLSI_82C534:
+ r->name = "VLSI 82C534";
+ r->get = pirq_vlsi_get;
+ r->set = pirq_vlsi_set;
+ return 1;
+ }
+ return 0;
+}
+
+
+static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch(device)
+ {
+ case PCI_DEVICE_ID_SERVERWORKS_OSB4:
+ case PCI_DEVICE_ID_SERVERWORKS_CSB5:
+ r->name = "ServerWorks";
+ r->get = pirq_serverworks_get;
+ r->set = pirq_serverworks_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ if (device != PCI_DEVICE_ID_SI_503)
+ return 0;
+
+ r->name = "SIS";
+ r->get = pirq_sis_get;
+ r->set = pirq_sis_set;
+ return 1;
+}
+
+static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch(device)
+ {
+ case PCI_DEVICE_ID_CYRIX_5520:
+ r->name = "NatSemi";
+ r->get = pirq_cyrix_get;
+ r->set = pirq_cyrix_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch(device)
+ {
+ case PCI_DEVICE_ID_OPTI_82C700:
+ r->name = "OPTI";
+ r->get = pirq_opti_get;
+ r->set = pirq_opti_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch(device)
+ {
+ case PCI_DEVICE_ID_ITE_IT8330G_0:
+ r->name = "ITE";
+ r->get = pirq_ite_get;
+ r->set = pirq_ite_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch(device)
+ {
+ case PCI_DEVICE_ID_AL_M1533:
+ case PCI_DEVICE_ID_AL_M1563:
+ printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
+ r->name = "ALI";
+ r->get = pirq_ali_get;
+ r->set = pirq_ali_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch(device)
+ {
+ case PCI_DEVICE_ID_AMD_VIPER_740B:
+ r->name = "AMD756";
+ break;
+ case PCI_DEVICE_ID_AMD_VIPER_7413:
+ r->name = "AMD766";
+ break;
+ case PCI_DEVICE_ID_AMD_VIPER_7443:
+ r->name = "AMD768";
+ break;
+ default:
+ return 0;
+ }
+ r->get = pirq_amd756_get;
+ r->set = pirq_amd756_set;
+ return 1;
+}
+
+static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch (device) {
+ case PCI_DEVICE_ID_PICOPOWER_PT86C523:
+ r->name = "PicoPower PT86C523";
+ r->get = pirq_pico_get;
+ r->set = pirq_pico_set;
+ return 1;
+
+ case PCI_DEVICE_ID_PICOPOWER_PT86C523BBP:
+ r->name = "PicoPower PT86C523 rev. BB+";
+ r->get = pirq_pico_get;
+ r->set = pirq_pico_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __initdata struct irq_router_handler pirq_routers[] = {
+ { PCI_VENDOR_ID_INTEL, intel_router_probe },
+ { PCI_VENDOR_ID_AL, ali_router_probe },
+ { PCI_VENDOR_ID_ITE, ite_router_probe },
+ { PCI_VENDOR_ID_VIA, via_router_probe },
+ { PCI_VENDOR_ID_OPTI, opti_router_probe },
+ { PCI_VENDOR_ID_SI, sis_router_probe },
+ { PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
+ { PCI_VENDOR_ID_VLSI, vlsi_router_probe },
+ { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
+ { PCI_VENDOR_ID_AMD, amd_router_probe },
+ { PCI_VENDOR_ID_PICOPOWER, pico_router_probe },
+ /* Someone with docs needs to add the ATI Radeon IGP */
+ { 0, NULL }
+};
+static struct irq_router pirq_router;
+static struct pci_dev *pirq_router_dev;
+
+
+/*
+ * FIXME: should we have an option to say "generic for
+ * chipset" ?
+ */
+
+static void __init pirq_find_router(struct irq_router *r)
+{
+ struct irq_routing_table *rt = pirq_table;
+ struct irq_router_handler *h;
+
+#ifdef CONFIG_PCI_BIOS
+ if (!rt->signature) {
+ printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
+ r->set = pirq_bios_set;
+ r->name = "BIOS";
+ return;
+ }
+#endif
+
+ /* Default unless a driver reloads it */
+ r->name = "default";
+ r->get = NULL;
+ r->set = NULL;
+
+ DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
+ rt->rtr_vendor, rt->rtr_device);
+
+ pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
+ if (!pirq_router_dev) {
+ DBG(KERN_DEBUG "PCI: Interrupt router not found at "
+ "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
+ return;
+ }
+
+ for( h = pirq_routers; h->vendor; h++) {
+ /* First look for a router match */
+ if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
+ break;
+ /* Fall back to a device match */
+ if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
+ break;
+ }
+ printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
+ pirq_router.name,
+ pirq_router_dev->vendor,
+ pirq_router_dev->device,
+ pci_name(pirq_router_dev));
+
+ /* The device remains referenced for the kernel lifetime */
+}
+
+static struct irq_info *pirq_get_info(struct pci_dev *dev)
+{
+ struct irq_routing_table *rt = pirq_table;
+ int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
+ struct irq_info *info;
+
+ for (info = rt->slots; entries--; info++)
+ if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
+ return info;
+ return NULL;
+}
+
+static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
+{
+ u8 pin;
+ struct irq_info *info;
+ int i, pirq, newirq;
+ int irq = 0;
+ u32 mask;
+ struct irq_router *r = &pirq_router;
+ struct pci_dev *dev2 = NULL;
+ char *msg = NULL;
+
+ /* Find IRQ pin */
+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+ if (!pin) {
+ DBG(KERN_DEBUG " -> no interrupt pin\n");
+ return 0;
+ }
+ pin = pin - 1;
+
+ /* Find IRQ routing entry */
+
+ if (!pirq_table)
+ return 0;
+
+ DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
+ info = pirq_get_info(dev);
+ if (!info) {
+ DBG(" -> not found in routing table\n" KERN_DEBUG);
+ return 0;
+ }
+ pirq = info->irq[pin].link;
+ mask = info->irq[pin].bitmap;
+ if (!pirq) {
+ DBG(" -> not routed\n" KERN_DEBUG);
+ return 0;
+ }
+ DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
+ mask &= pcibios_irq_mask;
+
+ /* Work around broken HP Pavilion Notebooks which assign USB to
+ IRQ 9 even though it is actually wired to IRQ 11 */
+
+ if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
+ dev->irq = 11;
+ pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
+ r->set(pirq_router_dev, dev, pirq, 11);
+ }
+
+ /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
+ if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
+ pirq = 0x68;
+ mask = 0x400;
+ dev->irq = r->get(pirq_router_dev, dev, pirq);
+ pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
+ }
+
+ /*
+ * Find the best IRQ to assign: use the one
+ * reported by the device if possible.
+ */
+ newirq = dev->irq;
+ if (newirq && !((1 << newirq) & mask)) {
+ if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
+ else printk("\n" KERN_WARNING
+ "PCI: IRQ %i for device %s doesn't match PIRQ mask "
+ "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
+ pci_name(dev));
+ }
+ if (!newirq && assign) {
+ for (i = 0; i < 16; i++) {
+ if (!(mask & (1 << i)))
+ continue;
+ if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
+ newirq = i;
+ }
+ }
+ DBG(" -> newirq=%d", newirq);
+
+ /* Check if it is hardcoded */
+ if ((pirq & 0xf0) == 0xf0) {
+ irq = pirq & 0xf;
+ DBG(" -> hardcoded IRQ %d\n", irq);
+ msg = "Hardcoded";
+ } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
+ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
+ DBG(" -> got IRQ %d\n", irq);
+ msg = "Found";
+ eisa_set_level_irq(irq);
+ } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
+ DBG(" -> assigning IRQ %d", newirq);
+ if (r->set(pirq_router_dev, dev, pirq, newirq)) {
+ eisa_set_level_irq(newirq);
+ DBG(" ... OK\n");
+ msg = "Assigned";
+ irq = newirq;
+ }
+ }
+
+ if (!irq) {
+ DBG(" ... failed\n");
+ if (newirq && mask == (1 << newirq)) {
+ msg = "Guessed";
+ irq = newirq;
+ } else
+ return 0;
+ }
+ printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
+
+ /* Update IRQ for all devices with the same pirq value */
+ while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
+ pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
+ if (!pin)
+ continue;
+ pin--;
+ info = pirq_get_info(dev2);
+ if (!info)
+ continue;
+ if (info->irq[pin].link == pirq) {
+ /* We refuse to override the dev->irq information. Give a warning! */
+ if ( dev2->irq && dev2->irq != irq && \
+ (!(pci_probe & PCI_USE_PIRQ_MASK) || \
+ ((1 << dev2->irq) & mask)) ) {
+#ifndef CONFIG_PCI_MSI
+ printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
+ pci_name(dev2), dev2->irq, irq);
+#endif
+ continue;
+ }
+ dev2->irq = irq;
+ pirq_penalty[irq]++;
+ if (dev != dev2)
+ printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
+ }
+ }
+ return 1;
+}
+
+static void __init pcibios_fixup_irqs(void)
+{
+ struct pci_dev *dev = NULL;
+ u8 pin;
+
+ DBG(KERN_DEBUG "PCI: IRQ fixup\n");
+ while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+ /*
+ * If the BIOS has set an out of range IRQ number, just ignore it.
+ * Also keep track of which IRQ's are already in use.
+ */
+ if (dev->irq >= 16) {
+ DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
+ dev->irq = 0;
+ }
+ /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
+ if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
+ pirq_penalty[dev->irq] = 0;
+ pirq_penalty[dev->irq]++;
+ }
+
+ dev = NULL;
+ while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * Recalculate IRQ numbers if we use the I/O APIC.
+ */
+ if (io_apic_assign_pci_irqs)
+ {
+ int irq;
+
+ if (pin) {
+ pin--; /* interrupt pins are numbered starting from 1 */
+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
+ /*
+ * Busses behind bridges are typically not listed in the MP-table.
+ * In this case we have to look up the IRQ based on the parent bus,
+ * parent slot, and pin number. The SMP code detects such bridged
+ * busses itself so we should get into this branch reliably.
+ */
+ if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
+ struct pci_dev * bridge = dev->bus->self;
+
+ pin = (pin + PCI_SLOT(dev->devfn)) % 4;
+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
+ PCI_SLOT(bridge->devfn), pin);
+ if (irq >= 0)
+ printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
+ pci_name(bridge), 'A' + pin, irq);
+ }
+ if (irq >= 0) {
+ printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
+ pci_name(dev), 'A' + pin, irq);
+ dev->irq = irq;
+ }
+ }
+ }
+#endif
+ /*
+ * Still no IRQ? Try to lookup one...
+ */
+ if (pin && !dev->irq)
+ pcibios_lookup_irq(dev, 0);
+ }
+}
+
+/*
+ * Work around broken HP Pavilion Notebooks which assign USB to
+ * IRQ 9 even though it is actually wired to IRQ 11
+ */
+static int __init fix_broken_hp_bios_irq9(const struct dmi_system_id *d)
+{
+ if (!broken_hp_bios_irq9) {
+ broken_hp_bios_irq9 = 1;
+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
+ }
+ return 0;
+}
+
+/*
+ * Work around broken Acer TravelMate 360 Notebooks which assign
+ * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
+ */
+static int __init fix_acer_tm360_irqrouting(const struct dmi_system_id *d)
+{
+ if (!acer_tm360_irqrouting) {
+ acer_tm360_irqrouting = 1;
+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
+ }
+ return 0;
+}
+
+static struct dmi_system_id __initdata pciirq_dmi_table[] = {
+ {
+ .callback = fix_broken_hp_bios_irq9,
+ .ident = "HP Pavilion N5400 Series Laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
+ DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
+ },
+ },
+ {
+ .callback = fix_acer_tm360_irqrouting,
+ .ident = "Acer TravelMate 36x Laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
+ },
+ },
+ { }
+};
+
+static int __init pcibios_irq_init(void)
+{
+ DBG(KERN_DEBUG "PCI: IRQ init\n");
+
+ if (pcibios_enable_irq || raw_pci_ops == NULL)
+ return 0;
+
+ dmi_check_system(pciirq_dmi_table);
+
+ pirq_table = pirq_find_routing_table();
+
+#ifdef CONFIG_PCI_BIOS
+ if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
+ pirq_table = pcibios_get_irq_routing_table();
+#endif
+ if (pirq_table) {
+ pirq_peer_trick();
+ pirq_find_router(&pirq_router);
+ if (pirq_table->exclusive_irqs) {
+ int i;
+ for (i=0; i<16; i++)
+ if (!(pirq_table->exclusive_irqs & (1 << i)))
+ pirq_penalty[i] += 100;
+ }
+ /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
+ if (io_apic_assign_pci_irqs)
+ pirq_table = NULL;
+ }
+
+ pcibios_enable_irq = pirq_enable_irq;
+
+ pcibios_fixup_irqs();
+ return 0;
+}
+
+subsys_initcall(pcibios_irq_init);
+
+
+static void pirq_penalize_isa_irq(int irq, int active)
+{
+ /*
+ * If any ISAPnP device reports an IRQ in its list of possible
+ * IRQ's, we try to avoid assigning it to PCI devices.
+ */
+ if (irq < 16) {
+ if (active)
+ pirq_penalty[irq] += 1000;
+ else
+ pirq_penalty[irq] += 100;
+ }
+}
+
+void pcibios_penalize_isa_irq(int irq, int active)
+{
+#ifdef CONFIG_ACPI
+ if (!acpi_noirq)
+ acpi_penalize_isa_irq(irq, active);
+ else
+#endif
+ pirq_penalize_isa_irq(irq, active);
+}
+
+static int pirq_enable_irq(struct pci_dev *dev)
+{
+ u8 pin;
+ struct pci_dev *temp_dev;
+
+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+ if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
+ char *msg = "";
+
+ pin--; /* interrupt pins are numbered starting from 1 */
+
+ if (io_apic_assign_pci_irqs) {
+ int irq;
+
+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
+ /*
+ * Busses behind bridges are typically not listed in the MP-table.
+ * In this case we have to look up the IRQ based on the parent bus,
+ * parent slot, and pin number. The SMP code detects such bridged
+ * busses itself so we should get into this branch reliably.
+ */
+ temp_dev = dev;
+ while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
+ struct pci_dev * bridge = dev->bus->self;
+
+ pin = (pin + PCI_SLOT(dev->devfn)) % 4;
+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
+ PCI_SLOT(bridge->devfn), pin);
+ if (irq >= 0)
+ printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
+ pci_name(bridge), 'A' + pin, irq);
+ dev = bridge;
+ }
+ dev = temp_dev;
+ if (irq >= 0) {
+ printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
+ pci_name(dev), 'A' + pin, irq);
+ dev->irq = irq;
+ return 0;
+ } else
+ msg = " Probably buggy MP table.";
+ } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
+ msg = "";
+ else
+ msg = " Please try using pci=biosirq.";
+
+ /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
+ if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
+ return 0;
+
+ printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
+ 'A' + pin, pci_name(dev), msg);
+ }
+ return 0;
+}
--- /dev/null
+/*
+ * PCI Frontend Stub - puts some "dummy" functions in to the Linux x86 PCI core
+ * to support the Xen PCI Frontend's operation
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <asm/acpi.h>
+#include "pci.h"
+
+static int pcifront_enable_irq(struct pci_dev *dev)
+{
+ u8 irq;
+ pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
+ dev->irq = irq;
+
+ return 0;
+}
+
+extern u8 pci_cache_line_size;
+
+static int __init pcifront_x86_stub_init(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ /* Only install our method if we haven't found real hardware already */
+ if (raw_pci_ops)
+ return 0;
+
+ printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
+
+ /* Copied from arch/i386/pci/common.c */
+ pci_cache_line_size = 32 >> 2;
+ if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
+ pci_cache_line_size = 64 >> 2; /* K7 & K8 */
+ else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
+ pci_cache_line_size = 128 >> 2; /* P4 */
+
+ /* On x86, we need to disable the normal IRQ routing table and
+ * just ask the backend
+ */
+ pcibios_enable_irq = pcifront_enable_irq;
+ pcibios_disable_irq = NULL;
+
+#ifdef CONFIG_ACPI
+ /* Keep ACPI out of the picture */
+ acpi_noirq = 1;
+#endif
+
+ return 0;
+}
+
+arch_initcall(pcifront_x86_stub_init);
obj-$(CONFIG_PM_SLEEP) += cpu_$(BITS).o
obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o
+
+disabled-obj-$(CONFIG_XEN) := cpu_$(BITS).o
static void fix_processor_context(void)
{
+#ifndef CONFIG_X86_NO_TSS
int cpu = smp_processor_id();
struct tss_struct *t = &per_cpu(init_tss, cpu);
* 386 hardware has concept of busy TSS or some
* similar stupidity.
*/
+#endif
load_TR_desc(); /* This does ltr */
load_LDT(¤t->active_mm->context); /* This does lldt */
vdso32.so-$(CONFIG_X86_32) += int80
vdso32.so-$(CONFIG_COMPAT) += syscall
vdso32.so-$(VDSO32-y) += sysenter
+xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
+xen-vdso32-$(CONFIG_X86_32) += syscall
+vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
--- /dev/null
+/*
+ * (C) Copyright 2002 Linus Torvalds
+ * Portions based on the vdso-randomization code from exec-shield:
+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
+ *
+ * This file contains the needed initializations to support sysenter.
+ */
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/thread_info.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/module.h>
+
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+#include <asm/pgtable.h>
+#include <asm/unistd.h>
+#include <asm/elf.h>
+#include <asm/tlbflush.h>
+#include <asm/vdso.h>
+#include <asm/proto.h>
+
+#include <xen/interface/callback.h>
+
+enum {
+ VDSO_DISABLED = 0,
+ VDSO_ENABLED = 1,
+ VDSO_COMPAT = 2,
+};
+
+#ifdef CONFIG_COMPAT_VDSO
+#define VDSO_DEFAULT VDSO_COMPAT
+#else
+#define VDSO_DEFAULT VDSO_ENABLED
+#endif
+
+#ifdef CONFIG_X86_64
+#define vdso_enabled sysctl_vsyscall32
+#define arch_setup_additional_pages syscall32_setup_pages
+#endif
+
+/*
+ * This is the difference between the prelinked addresses in the vDSO images
+ * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
+ * in the user address space.
+ */
+#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
+
+/*
+ * Should the kernel map a VDSO page into processes and pass its
+ * address down to glibc upon exec()?
+ */
+unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
+
+static int __init vdso_setup(char *s)
+{
+ vdso_enabled = simple_strtoul(s, NULL, 0);
+
+ return 1;
+}
+
+/*
+ * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
+ * behavior on both 64-bit and 32-bit kernels.
+ * On 32-bit kernels, vdso=[012] means the same thing.
+ */
+__setup("vdso32=", vdso_setup);
+
+#ifdef CONFIG_X86_32
+__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
+
+EXPORT_SYMBOL_GPL(vdso_enabled);
+#endif
+
+static __init void reloc_symtab(Elf32_Ehdr *ehdr,
+ unsigned offset, unsigned size)
+{
+ Elf32_Sym *sym = (void *)ehdr + offset;
+ unsigned nsym = size / sizeof(*sym);
+ unsigned i;
+
+ for(i = 0; i < nsym; i++, sym++) {
+ if (sym->st_shndx == SHN_UNDEF ||
+ sym->st_shndx == SHN_ABS)
+ continue; /* skip */
+
+ if (sym->st_shndx > SHN_LORESERVE) {
+ printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
+ sym->st_shndx);
+ continue;
+ }
+
+ switch(ELF_ST_TYPE(sym->st_info)) {
+ case STT_OBJECT:
+ case STT_FUNC:
+ case STT_SECTION:
+ case STT_FILE:
+ sym->st_value += VDSO_ADDR_ADJUST;
+ }
+ }
+}
+
+static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
+{
+ Elf32_Dyn *dyn = (void *)ehdr + offset;
+
+ for(; dyn->d_tag != DT_NULL; dyn++)
+ switch(dyn->d_tag) {
+ case DT_PLTGOT:
+ case DT_HASH:
+ case DT_STRTAB:
+ case DT_SYMTAB:
+ case DT_RELA:
+ case DT_INIT:
+ case DT_FINI:
+ case DT_REL:
+ case DT_DEBUG:
+ case DT_JMPREL:
+ case DT_VERSYM:
+ case DT_VERDEF:
+ case DT_VERNEED:
+ case DT_ADDRRNGLO ... DT_ADDRRNGHI:
+ /* definitely pointers needing relocation */
+ dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
+ break;
+
+ case DT_ENCODING ... OLD_DT_LOOS-1:
+ case DT_LOOS ... DT_HIOS-1:
+ /* Tags above DT_ENCODING are pointers if
+ they're even */
+ if (dyn->d_tag >= DT_ENCODING &&
+ (dyn->d_tag & 1) == 0)
+ dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
+ break;
+
+ case DT_VERDEFNUM:
+ case DT_VERNEEDNUM:
+ case DT_FLAGS_1:
+ case DT_RELACOUNT:
+ case DT_RELCOUNT:
+ case DT_VALRNGLO ... DT_VALRNGHI:
+ /* definitely not pointers */
+ break;
+
+ case OLD_DT_LOOS ... DT_LOOS-1:
+ case DT_HIOS ... DT_VALRNGLO-1:
+ default:
+ if (dyn->d_tag > DT_ENCODING)
+ printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
+ dyn->d_tag);
+ break;
+ }
+}
+
+static __init void relocate_vdso(Elf32_Ehdr *ehdr)
+{
+ Elf32_Phdr *phdr;
+ Elf32_Shdr *shdr;
+ int i;
+
+ BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||
+ !elf_check_arch_ia32(ehdr) ||
+ ehdr->e_type != ET_DYN);
+
+ ehdr->e_entry += VDSO_ADDR_ADJUST;
+
+ /* rebase phdrs */
+ phdr = (void *)ehdr + ehdr->e_phoff;
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ phdr[i].p_vaddr += VDSO_ADDR_ADJUST;
+
+ /* relocate dynamic stuff */
+ if (phdr[i].p_type == PT_DYNAMIC)
+ reloc_dyn(ehdr, phdr[i].p_offset);
+ }
+
+ /* rebase sections */
+ shdr = (void *)ehdr + ehdr->e_shoff;
+ for(i = 0; i < ehdr->e_shnum; i++) {
+ if (!(shdr[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ shdr[i].sh_addr += VDSO_ADDR_ADJUST;
+
+ if (shdr[i].sh_type == SHT_SYMTAB ||
+ shdr[i].sh_type == SHT_DYNSYM)
+ reloc_symtab(ehdr, shdr[i].sh_offset,
+ shdr[i].sh_size);
+ }
+}
+
+/*
+ * These symbols are defined by vdso32.S to mark the bounds
+ * of the ELF DSO images included therein.
+ */
+extern const char vdso32_default_start, vdso32_default_end;
+extern const char vdso32_sysenter_start, vdso32_sysenter_end;
+static struct page *vdso32_pages[1];
+
+
+#ifdef CONFIG_X86_64
+
+#if CONFIG_XEN_COMPAT < 0x030200
+static int use_int80 = 1;
+#endif
+static int use_sysenter __read_mostly = -1;
+
+#define vdso32_sysenter() (use_sysenter > 0)
+
+/* May not be __init: called during resume */
+void syscall32_cpu_init(void)
+{
+#ifdef CONFIG_XEN
+ static const struct callback_register cstar = {
+ .type = CALLBACKTYPE_syscall32,
+ .address = (unsigned long)ia32_cstar_target
+ };
+ static const struct callback_register sysenter = {
+ .type = CALLBACKTYPE_sysenter,
+ .address = (unsigned long)ia32_sysenter_target
+ };
+
+ if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
+ (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
+#if CONFIG_XEN_COMPAT < 0x030200
+ return;
+ use_int80 = 0;
+#else
+ BUG();
+#endif
+#endif
+
+ if (use_sysenter < 0) {
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ use_sysenter = 1;
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
+ use_sysenter = 1;
+ }
+}
+
+#define compat_uses_vma 1
+
+static inline void map_compat_vdso(int map)
+{
+}
+
+#else /* CONFIG_X86_32 */
+
+#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
+
+extern asmlinkage void ia32pv_cstar_target(void);
+static const struct callback_register cstar = {
+ .type = CALLBACKTYPE_syscall32,
+ .address = { __KERNEL_CS, (unsigned long)ia32pv_cstar_target },
+};
+
+void __cpuinit enable_sep_cpu(void)
+{
+ extern asmlinkage void ia32pv_sysenter_target(void);
+ static struct callback_register __cpuinitdata sysenter = {
+ .type = CALLBACKTYPE_sysenter,
+ .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
+ };
+
+ if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
+ BUG();
+ return;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_SEP))
+ return;
+
+ if (xen_feature(XENFEAT_supervisor_mode_kernel))
+ sysenter.address.eip = (unsigned long)ia32_sysenter_target;
+
+ switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
+ case 0:
+ break;
+#if CONFIG_XEN_COMPAT < 0x030200
+ case -ENOSYS:
+ sysenter.type = CALLBACKTYPE_sysenter_deprecated;
+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
+ break;
+#endif
+ default:
+ setup_clear_cpu_cap(X86_FEATURE_SEP);
+ break;
+ }
+}
+
+static struct vm_area_struct gate_vma;
+
+static int __init gate_vma_init(void)
+{
+ gate_vma.vm_mm = NULL;
+ gate_vma.vm_start = FIXADDR_USER_START;
+ gate_vma.vm_end = FIXADDR_USER_END;
+ gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+ gate_vma.vm_page_prot = __P101;
+ /*
+ * Make sure the vDSO gets into every core dump.
+ * Dumping its contents makes post-mortem fully interpretable later
+ * without matching up the same kernel and hardware config to see
+ * what PC values meant.
+ */
+ gate_vma.vm_flags |= VM_ALWAYSDUMP;
+ return 0;
+}
+
+#define compat_uses_vma 0
+
+static void map_compat_vdso(int map)
+{
+ static int vdso_mapped;
+
+ if (map == vdso_mapped)
+ return;
+
+ vdso_mapped = map;
+
+ __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT,
+ map ? PAGE_READONLY_EXEC : PAGE_NONE);
+
+ /* flush stray tlbs */
+ flush_tlb_all();
+}
+
+#endif /* CONFIG_X86_64 */
+
+int __init sysenter_setup(void)
+{
+ void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
+ const void *vsyscall;
+ size_t vsyscall_len;
+
+ vdso32_pages[0] = virt_to_page(syscall_page);
+
+#ifdef CONFIG_X86_32
+ gate_vma_init();
+#endif
+
+#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
+ if (use_int80) {
+ extern const char vdso32_int80_start, vdso32_int80_end;
+
+ vsyscall = &vdso32_int80_start;
+ vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
+ } else
+#elif defined(CONFIG_X86_32)
+ if (boot_cpu_has(X86_FEATURE_SYSCALL)
+ && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
+ || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
+ barrier(); /* until clear_bit()'s constraints are correct ... */
+ if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
+ extern const char vdso32_syscall_start, vdso32_syscall_end;
+
+ vsyscall = &vdso32_syscall_start;
+ vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
+ } else
+#endif
+ if (!vdso32_sysenter()) {
+ vsyscall = &vdso32_default_start;
+ vsyscall_len = &vdso32_default_end - &vdso32_default_start;
+ } else {
+ vsyscall = &vdso32_sysenter_start;
+ vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
+ }
+
+ memcpy(syscall_page, vsyscall, vsyscall_len);
+ relocate_vdso(syscall_page);
+
+ return 0;
+}
+
+/* Setup a VMA at program startup for the vsyscall page */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr;
+ int ret = 0;
+ bool compat;
+
+ if (vdso_enabled == VDSO_DISABLED)
+ return 0;
+
+ down_write(&mm->mmap_sem);
+
+ /* Test compat mode once here, in case someone
+ changes it via sysctl */
+ compat = (vdso_enabled == VDSO_COMPAT);
+
+ map_compat_vdso(compat);
+
+ if (compat)
+ addr = VDSO_HIGH_BASE;
+ else {
+ addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+ if (IS_ERR_VALUE(addr)) {
+ ret = addr;
+ goto up_fail;
+ }
+ }
+
+ if (compat_uses_vma || !compat) {
+ /*
+ * MAYWRITE to allow gdb to COW and set breakpoints
+ *
+ * Make sure the vDSO gets into every core dump.
+ * Dumping its contents makes post-mortem fully
+ * interpretable later without matching up the same
+ * kernel and hardware config to see what PC values
+ * meant.
+ */
+ ret = install_special_mapping(mm, addr, PAGE_SIZE,
+ VM_READ|VM_EXEC|
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
+ VM_ALWAYSDUMP,
+ vdso32_pages);
+
+ if (ret)
+ goto up_fail;
+ }
+
+ current->mm->context.vdso = (void *)addr;
+ current_thread_info()->sysenter_return =
+ VDSO32_SYMBOL(addr, SYSENTER_RETURN);
+
+ up_fail:
+ up_write(&mm->mmap_sem);
+
+ return ret;
+}
+
+#ifdef CONFIG_X86_64
+
+/*
+ * This must be done early in case we have an initrd containing 32-bit
+ * binaries (e.g., hotplug). This could be pushed upstream.
+ */
+core_initcall(sysenter_setup);
+
+#ifdef CONFIG_SYSCTL
+/* Register vsyscall32 into the ABI table */
+#include <linux/sysctl.h>
+
+static ctl_table abi_table2[] = {
+ {
+ .procname = "vsyscall32",
+ .data = &sysctl_vsyscall32,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {}
+};
+
+static ctl_table abi_root_table2[] = {
+ {
+ .ctl_name = CTL_ABI,
+ .procname = "abi",
+ .mode = 0555,
+ .child = abi_table2
+ },
+ {}
+};
+
+static __init int ia32_binfmt_init(void)
+{
+ register_sysctl_table(abi_root_table2);
+ return 0;
+}
+__initcall(ia32_binfmt_init);
+#endif
+
+#else /* CONFIG_X86_32 */
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
+ return "[vdso]";
+ return NULL;
+}
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+ struct mm_struct *mm = tsk->mm;
+
+ /* Check to see if this task was created in compat vdso mode */
+ if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
+ return &gate_vma;
+ return NULL;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+ const struct vm_area_struct *vma = get_gate_vma(task);
+
+ return vma && addr >= vma->vm_start && addr < vma->vm_end;
+}
+
+int in_gate_area_no_task(unsigned long addr)
+{
+ return 0;
+}
+
+#endif /* CONFIG_X86_64 */
.incbin "arch/x86/vdso/vdso32-sysenter.so"
vdso32_sysenter_end:
+#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
+ .globl vdso32_int80_start, vdso32_int80_end
+vdso32_int80_start:
+ .incbin "arch/x86/vdso/vdso32-int80.so"
+vdso32_int80_end:
+#elif defined(CONFIG_X86_XEN)
+ .globl vdso32_syscall_start, vdso32_syscall_end
+vdso32_syscall_start:
+ .incbin "arch/x86/vdso/vdso32-syscall.so"
+vdso32_syscall_end:
+#endif
+
__FINIT
.long LINUX_VERSION_CODE
ELFNOTE_END
-#ifdef CONFIG_XEN
+#if defined(CONFIG_X86_XEN) || defined(CONFIG_PARAVIRT_XEN)
/*
* Add a special note telling glibc's dynamic linker a fake hardware
* flavor that it will use to choose the search path for libraries in the
ELFNOTE_START(GNU, 2, "a")
.long 1 /* ncaps */
+#ifdef CONFIG_PARAVIRT_XEN
VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */
.long 0 /* mask */
+#else
+ .long 1 << VDSO_NOTE_NONEGSEG_BIT /* mask */
+#endif
.byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
ELFNOTE_END
#endif
.Lpush_ebp:
movl %ecx, %ebp
syscall
+#ifndef CONFIG_XEN
movl $__USER32_DS, %ecx
movl %ecx, %ss
+#endif
movl %ebp, %ecx
popl %ebp
.Lpop_ebp:
# This Kconfig describes xen options
#
-config XEN
+config PARAVIRT_XEN
bool "Xen guest support"
select PARAVIRT
select PARAVIRT_CLOCK
/* Xen-specific pieces of head.S, intended to be included in the right
place in head.S */
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
#include <linux/elfnote.h>
#include <linux/init.h>
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
-#endif /*CONFIG_XEN */
+#endif /* CONFIG_PARAVIRT_XEN */
obj-$(CONFIG_PNP) += pnp/
obj-$(CONFIG_ARM_AMBA) += amba/
-obj-$(CONFIG_XEN) += xen/
+obj-$(CONFIG_PARAVIRT_XEN) += xen/
# char/ comes before serial/ etc so that the VT console is the boot-time
# default.
obj-$(CONFIG_NUBUS) += nubus/
obj-$(CONFIG_ATM) += atm/
obj-y += macintosh/
+obj-$(CONFIG_XEN) += xen/
obj-$(CONFIG_IDE) += ide/
obj-$(CONFIG_SCSI) += scsi/
obj-$(CONFIG_ATA) += ata/
config X86_PM_TIMER
bool "Power Management Timer Support" if EMBEDDED
depends on X86
+ depends on !XEN
default y
help
The Power Management Timer is available on all ACPI-capable,
This driver adds support for the Smart Battery System, another
type of access to battery information, found on some laptops.
+config ACPI_PV_SLEEP
+ bool
+ depends on X86 && XEN && ACPI_SLEEP
+ default y
endif # ACPI
u32 PM1Bcontrol;
struct acpi_bit_register_info *sleep_type_reg_info;
struct acpi_bit_register_info *sleep_enable_reg_info;
+#if !(defined(CONFIG_XEN) && defined(CONFIG_X86))
u32 in_value;
+#else
+ int err;
+#endif
struct acpi_object_list arg_list;
union acpi_object arg;
acpi_status status;
ACPI_FLUSH_CPU_CACHE();
+#if !(defined(CONFIG_XEN) && defined(CONFIG_X86))
status = acpi_hw_register_write(ACPI_REGISTER_PM1A_CONTROL,
PM1Acontrol);
if (ACPI_FAILURE(status)) {
/* Spin until we wake */
} while (!in_value);
+#else
+ /* PV ACPI just need check hypercall return value */
+ err = acpi_notify_hypervisor_state(sleep_state,
+ PM1Acontrol, PM1Bcontrol);
+ if (err) {
+ ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+ "Hypervisor failure [%d]\n", err));
+ return_ACPI_STATUS(AE_ERROR);
+ }
+#endif
return_ACPI_STATUS(AE_OK);
}
* THIS FUNCTION MUST BE CALLED WITH INTERRUPTS DISABLED
*
******************************************************************************/
+#ifndef CONFIG_XEN
acpi_status asmlinkage acpi_enter_sleep_state_s4bios(void)
{
u32 in_value;
}
ACPI_EXPORT_SYMBOL(acpi_enter_sleep_state_s4bios)
+#endif
/*******************************************************************************
*
static int acpi_sleep_prepare(u32 acpi_state)
{
#ifdef CONFIG_ACPI_SLEEP
+#ifndef CONFIG_ACPI_PV_SLEEP
/* do we have a wakeup address for S2 and S3? */
if (acpi_state == ACPI_STATE_S3) {
if (!acpi_wakeup_address) {
(acpi_physical_address)acpi_wakeup_address);
}
+#endif
ACPI_FLUSH_CPU_CACHE();
acpi_enable_wakeup_device_prep(acpi_state);
#endif
break;
case ACPI_STATE_S3:
+#ifdef CONFIG_ACPI_PV_SLEEP
+ /* Hyperviosr will save and restore CPU context
+ * and then we can skip low level housekeeping here.
+ */
+ acpi_enter_sleep_state(acpi_state);
+#else
do_suspend_lowlevel();
+#endif
break;
}
acpi_target_sleep_state = ACPI_STATE_S0;
-#ifdef CONFIG_X86
+#if defined(CONFIG_X86) && !defined(CONFIG_ACPI_PV_SLEEP)
if (init_8259A_after_S1) {
printk("Broken toshiba laptop -> kicking interrupts\n");
init_8259A(0);
help
Include support for the Xilinx SystemACE CompactFlash interface
-config XEN_BLKDEV_FRONTEND
+config XEN_BLKFRONT
tristate "Xen virtual block device support"
- depends on XEN
+ depends on PARAVIRT_XEN
default y
help
This driver implements the front-end of the Xen virtual
obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
obj-$(CONFIG_BLK_DEV_UB) += ub.o
-obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
+obj-$(CONFIG_XEN_BLKFRONT) += xen-blkfront.o
obj-$(CONFIG_CIPHER_TWOFISH) += loop_fish2.o
static struct xenbus_driver blkfront = {
.name = "vbd",
- .owner = THIS_MODULE,
.ids = blkfront_ids,
.probe = blkfront_probe,
.remove = blkfront_remove,
config HVC_XEN
bool "Xen Hypervisor Console support"
- depends on XEN
+ depends on PARAVIRT_XEN
select HVC_DRIVER
default y
help
if (page == NULL)
return NULL;
+#ifdef CONFIG_XEN
+ if (xen_create_contiguous_region((unsigned long)page_address(page), 2, 32)) {
+ __free_pages(page, 2);
+ return NULL;
+ }
+#endif
+
if (set_pages_uc(page, 4) < 0) {
set_pages_wb(page, 4);
__free_pages(page, 2);
page = virt_to_page(addr);
set_pages_wb(page, 4);
+#ifdef CONFIG_XEN
+ xen_destroy_contiguous_region((unsigned long)page_address(page), 2);
+#endif
put_page(page);
__free_pages(page, 2);
atomic_dec(&agp_bridge->current_memory_agp);
{
}
+#ifndef ARCH_HAS_DEV_MEM
/*
* This funcion reads the *physical* memory. The f_pos points directly to the
* memory location.
*ppos += written;
return written;
}
+#endif
int __attribute__((weak)) phys_mem_access_prot_allowed(struct file *file,
unsigned long pfn, unsigned long size, pgprot_t *vma_prot)
static int mmap_kmem(struct file * file, struct vm_area_struct * vma)
{
unsigned long pfn;
+#ifdef CONFIG_XEN
+ unsigned long i, count;
+#endif
/* Turn a kernel-virtual address into a physical page frame */
pfn = __pa((u64)vma->vm_pgoff << PAGE_SHIFT) >> PAGE_SHIFT;
if (!pfn_valid(pfn))
return -EIO;
+#ifdef CONFIG_XEN
+ count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ for (i = 0; i < count; i++)
+ if ((pfn + i) != mfn_to_local_pfn(pfn_to_mfn(pfn + i)))
+ return -EIO;
+#endif
+
vma->vm_pgoff = pfn;
return mmap_mem(file, vma);
}
#define open_kmem open_mem
#define open_oldmem open_mem
+#ifndef ARCH_HAS_DEV_MEM
static const struct file_operations mem_fops = {
.llseek = memory_lseek,
.read = read_mem,
.open = open_mem,
.get_unmapped_area = get_unmapped_area_mem,
};
+#else
+extern const struct file_operations mem_fops;
+#endif
#ifdef CONFIG_DEVKMEM
static const struct file_operations kmem_fops = {
Further information on this driver and the supported hardware
can be found at http://www.prosec.rub.de/tpm
+config TCG_XEN
+ tristate "XEN TPM Interface"
+ depends on XEN
+ ---help---
+ If you want to make TPM support available to a Xen user domain,
+ say Yes and it will be accessible from within Linux.
+ To compile this driver as a module, choose M here; the module
+ will be called tpm_xenu.
+
endif # TCG_TPM
obj-$(CONFIG_TCG_NSC) += tpm_nsc.o
obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o
obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o
+obj-$(CONFIG_TCG_XEN) += tpm_xenu.o
+tpm_xenu-y = tpm_xen.o tpm_vtpm.o
struct dentry **bios_dir;
struct list_head list;
+#ifdef CONFIG_XEN
+ void *priv;
+#endif
void (*release) (struct device *);
};
outb(value & 0xFF, base+1);
}
+#ifdef CONFIG_XEN
+static inline void *chip_get_private(const struct tpm_chip *chip)
+{
+ return chip->priv;
+}
+
+static inline void chip_set_private(struct tpm_chip *chip, void *priv)
+{
+ chip->priv = priv;
+}
+#endif
+
extern void tpm_get_timeouts(struct tpm_chip *);
extern void tpm_gen_interrupt(struct tpm_chip *);
extern void tpm_continue_selftest(struct tpm_chip *);
--- /dev/null
+/*
+ * Copyright (C) 2006 IBM Corporation
+ *
+ * Authors:
+ * Stefan Berger <stefanb@us.ibm.com>
+ *
+ * Generic device driver part for device drivers in a virtualized
+ * environment.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <linux/list.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include "tpm.h"
+#include "tpm_vtpm.h"
+
+/* read status bits */
+enum {
+ STATUS_BUSY = 0x01,
+ STATUS_DATA_AVAIL = 0x02,
+ STATUS_READY = 0x04
+};
+
+struct transmission {
+ struct list_head next;
+
+ unsigned char *request;
+ size_t request_len;
+ size_t request_buflen;
+
+ unsigned char *response;
+ size_t response_len;
+ size_t response_buflen;
+
+ unsigned int flags;
+};
+
+enum {
+ TRANSMISSION_FLAG_WAS_QUEUED = 0x1
+};
+
+
+enum {
+ DATAEX_FLAG_QUEUED_ONLY = 0x1
+};
+
+
+/* local variables */
+
+/* local function prototypes */
+static int _vtpm_send_queued(struct tpm_chip *chip);
+
+
+/* =============================================================
+ * Some utility functions
+ * =============================================================
+ */
+static void vtpm_state_init(struct vtpm_state *vtpms)
+{
+ vtpms->current_request = NULL;
+ spin_lock_init(&vtpms->req_list_lock);
+ init_waitqueue_head(&vtpms->req_wait_queue);
+ INIT_LIST_HEAD(&vtpms->queued_requests);
+
+ vtpms->current_response = NULL;
+ spin_lock_init(&vtpms->resp_list_lock);
+ init_waitqueue_head(&vtpms->resp_wait_queue);
+
+ vtpms->disconnect_time = jiffies;
+}
+
+
+static inline struct transmission *transmission_alloc(void)
+{
+ return kzalloc(sizeof(struct transmission), GFP_ATOMIC);
+}
+
+static unsigned char *
+transmission_set_req_buffer(struct transmission *t,
+ unsigned char *buffer, size_t len)
+{
+ if (t->request_buflen < len) {
+ kfree(t->request);
+ t->request = kmalloc(len, GFP_KERNEL);
+ if (!t->request) {
+ t->request_buflen = 0;
+ return NULL;
+ }
+ t->request_buflen = len;
+ }
+
+ memcpy(t->request, buffer, len);
+ t->request_len = len;
+
+ return t->request;
+}
+
+static unsigned char *
+transmission_set_res_buffer(struct transmission *t,
+ const unsigned char *buffer, size_t len)
+{
+ if (t->response_buflen < len) {
+ kfree(t->response);
+ t->response = kmalloc(len, GFP_ATOMIC);
+ if (!t->response) {
+ t->response_buflen = 0;
+ return NULL;
+ }
+ t->response_buflen = len;
+ }
+
+ memcpy(t->response, buffer, len);
+ t->response_len = len;
+
+ return t->response;
+}
+
+static inline void transmission_free(struct transmission *t)
+{
+ kfree(t->request);
+ kfree(t->response);
+ kfree(t);
+}
+
+/* =============================================================
+ * Interface with the lower layer driver
+ * =============================================================
+ */
+/*
+ * Lower layer uses this function to make a response available.
+ */
+int vtpm_vd_recv(const struct tpm_chip *chip,
+ const unsigned char *buffer, size_t count,
+ void *ptr)
+{
+ unsigned long flags;
+ int ret_size = 0;
+ struct transmission *t;
+ struct vtpm_state *vtpms;
+
+ vtpms = (struct vtpm_state *)chip_get_private(chip);
+
+ /*
+ * The list with requests must contain one request
+ * only and the element there must be the one that
+ * was passed to me from the front-end.
+ */
+ spin_lock_irqsave(&vtpms->resp_list_lock, flags);
+ if (vtpms->current_request != ptr) {
+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
+ return 0;
+ }
+
+ if ((t = vtpms->current_request)) {
+ transmission_free(t);
+ vtpms->current_request = NULL;
+ }
+
+ t = transmission_alloc();
+ if (t) {
+ if (!transmission_set_res_buffer(t, buffer, count)) {
+ transmission_free(t);
+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
+ return -ENOMEM;
+ }
+ ret_size = count;
+ vtpms->current_response = t;
+ wake_up_interruptible(&vtpms->resp_wait_queue);
+ }
+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
+
+ return ret_size;
+}
+
+
+/*
+ * Lower layer indicates its status (connected/disconnected)
+ */
+void vtpm_vd_status(const struct tpm_chip *chip, u8 vd_status)
+{
+ struct vtpm_state *vtpms;
+
+ vtpms = (struct vtpm_state *)chip_get_private(chip);
+
+ vtpms->vd_status = vd_status;
+ if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) {
+ vtpms->disconnect_time = jiffies;
+ }
+}
+
+/* =============================================================
+ * Interface with the generic TPM driver
+ * =============================================================
+ */
+static int vtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count)
+{
+ int rc = 0;
+ unsigned long flags;
+ struct vtpm_state *vtpms;
+
+ vtpms = (struct vtpm_state *)chip_get_private(chip);
+
+ /*
+ * Check if the previous operation only queued the command
+ * In this case there won't be a response, so I just
+ * return from here and reset that flag. In any other
+ * case I should receive a response from the back-end.
+ */
+ spin_lock_irqsave(&vtpms->resp_list_lock, flags);
+ if ((vtpms->flags & DATAEX_FLAG_QUEUED_ONLY) != 0) {
+ vtpms->flags &= ~DATAEX_FLAG_QUEUED_ONLY;
+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
+ /*
+ * The first few commands (measurements) must be
+ * queued since it might not be possible to talk to the
+ * TPM, yet.
+ * Return a response of up to 30 '0's.
+ */
+
+ count = min_t(size_t, count, 30);
+ memset(buf, 0x0, count);
+ return count;
+ }
+ /*
+ * Check whether something is in the responselist and if
+ * there's nothing in the list wait for something to appear.
+ */
+
+ if (!vtpms->current_response) {
+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
+ interruptible_sleep_on_timeout(&vtpms->resp_wait_queue,
+ 1000);
+ spin_lock_irqsave(&vtpms->resp_list_lock ,flags);
+ }
+
+ if (vtpms->current_response) {
+ struct transmission *t = vtpms->current_response;
+ vtpms->current_response = NULL;
+ rc = min(count, t->response_len);
+ memcpy(buf, t->response, rc);
+ transmission_free(t);
+ }
+
+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
+ return rc;
+}
+
+static int vtpm_send(struct tpm_chip *chip, u8 *buf, size_t count)
+{
+ int rc = 0;
+ unsigned long flags;
+ struct transmission *t = transmission_alloc();
+ struct vtpm_state *vtpms;
+
+ vtpms = (struct vtpm_state *)chip_get_private(chip);
+
+ if (!t)
+ return -ENOMEM;
+ /*
+ * If there's a current request, it must be the
+ * previous request that has timed out.
+ */
+ spin_lock_irqsave(&vtpms->req_list_lock, flags);
+ if (vtpms->current_request != NULL) {
+ printk("WARNING: Sending although there is a request outstanding.\n"
+ " Previous request must have timed out.\n");
+ transmission_free(vtpms->current_request);
+ vtpms->current_request = NULL;
+ }
+ spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
+
+ /*
+ * Queue the packet if the driver below is not
+ * ready, yet, or there is any packet already
+ * in the queue.
+ * If the driver below is ready, unqueue all
+ * packets first before sending our current
+ * packet.
+ * For each unqueued packet, except for the
+ * last (=current) packet, call the function
+ * tpm_xen_recv to wait for the response to come
+ * back.
+ */
+ if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) {
+ if (time_after(jiffies,
+ vtpms->disconnect_time + HZ * 10)) {
+ rc = -ENOENT;
+ } else {
+ goto queue_it;
+ }
+ } else {
+ /*
+ * Send all queued packets.
+ */
+ if (_vtpm_send_queued(chip) == 0) {
+
+ vtpms->current_request = t;
+
+ rc = vtpm_vd_send(vtpms->tpm_private,
+ buf,
+ count,
+ t);
+ /*
+ * The generic TPM driver will call
+ * the function to receive the response.
+ */
+ if (rc < 0) {
+ vtpms->current_request = NULL;
+ goto queue_it;
+ }
+ } else {
+queue_it:
+ if (!transmission_set_req_buffer(t, buf, count)) {
+ transmission_free(t);
+ rc = -ENOMEM;
+ goto exit;
+ }
+ /*
+ * An error occurred. Don't event try
+ * to send the current request. Just
+ * queue it.
+ */
+ spin_lock_irqsave(&vtpms->req_list_lock, flags);
+ vtpms->flags |= DATAEX_FLAG_QUEUED_ONLY;
+ list_add_tail(&t->next, &vtpms->queued_requests);
+ spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
+ }
+ }
+
+exit:
+ return rc;
+}
+
+
+/*
+ * Send all queued requests.
+ */
+static int _vtpm_send_queued(struct tpm_chip *chip)
+{
+ int rc;
+ int error = 0;
+ long flags;
+ unsigned char buffer[1];
+ struct vtpm_state *vtpms;
+ vtpms = (struct vtpm_state *)chip_get_private(chip);
+
+ spin_lock_irqsave(&vtpms->req_list_lock, flags);
+
+ while (!list_empty(&vtpms->queued_requests)) {
+ /*
+ * Need to dequeue them.
+ * Read the result into a dummy buffer.
+ */
+ struct transmission *qt = (struct transmission *)
+ vtpms->queued_requests.next;
+ list_del(&qt->next);
+ vtpms->current_request = qt;
+ spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
+
+ rc = vtpm_vd_send(vtpms->tpm_private,
+ qt->request,
+ qt->request_len,
+ qt);
+
+ if (rc < 0) {
+ spin_lock_irqsave(&vtpms->req_list_lock, flags);
+ if ((qt = vtpms->current_request) != NULL) {
+ /*
+ * requeue it at the beginning
+ * of the list
+ */
+ list_add(&qt->next,
+ &vtpms->queued_requests);
+ }
+ vtpms->current_request = NULL;
+ error = 1;
+ break;
+ }
+ /*
+ * After this point qt is not valid anymore!
+ * It is freed when the front-end is delivering
+ * the data by calling tpm_recv
+ */
+ /*
+ * Receive response into provided dummy buffer
+ */
+ rc = vtpm_recv(chip, buffer, sizeof(buffer));
+ spin_lock_irqsave(&vtpms->req_list_lock, flags);
+ }
+
+ spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
+
+ return error;
+}
+
+static void vtpm_cancel(struct tpm_chip *chip)
+{
+ unsigned long flags;
+ struct vtpm_state *vtpms = (struct vtpm_state *)chip_get_private(chip);
+
+ spin_lock_irqsave(&vtpms->resp_list_lock,flags);
+
+ if (!vtpms->current_response && vtpms->current_request) {
+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
+ interruptible_sleep_on(&vtpms->resp_wait_queue);
+ spin_lock_irqsave(&vtpms->resp_list_lock,flags);
+ }
+
+ if (vtpms->current_response) {
+ struct transmission *t = vtpms->current_response;
+ vtpms->current_response = NULL;
+ transmission_free(t);
+ }
+
+ spin_unlock_irqrestore(&vtpms->resp_list_lock,flags);
+}
+
+static u8 vtpm_status(struct tpm_chip *chip)
+{
+ u8 rc = 0;
+ unsigned long flags;
+ struct vtpm_state *vtpms;
+
+ vtpms = (struct vtpm_state *)chip_get_private(chip);
+
+ spin_lock_irqsave(&vtpms->resp_list_lock, flags);
+ /*
+ * Data are available if:
+ * - there's a current response
+ * - the last packet was queued only (this is fake, but necessary to
+ * get the generic TPM layer to call the receive function.)
+ */
+ if (vtpms->current_response ||
+ 0 != (vtpms->flags & DATAEX_FLAG_QUEUED_ONLY)) {
+ rc = STATUS_DATA_AVAIL;
+ } else if (!vtpms->current_response && !vtpms->current_request) {
+ rc = STATUS_READY;
+ }
+
+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
+ return rc;
+}
+
+static struct file_operations vtpm_ops = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .open = tpm_open,
+ .read = tpm_read,
+ .write = tpm_write,
+ .release = tpm_release,
+};
+
+static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
+static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL);
+static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL);
+static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
+static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
+static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated,
+ NULL);
+static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
+static DEVICE_ATTR(cancel, S_IWUSR |S_IWGRP, NULL, tpm_store_cancel);
+
+static struct attribute *vtpm_attrs[] = {
+ &dev_attr_pubek.attr,
+ &dev_attr_pcrs.attr,
+ &dev_attr_enabled.attr,
+ &dev_attr_active.attr,
+ &dev_attr_owned.attr,
+ &dev_attr_temp_deactivated.attr,
+ &dev_attr_caps.attr,
+ &dev_attr_cancel.attr,
+ NULL,
+};
+
+static struct attribute_group vtpm_attr_grp = { .attrs = vtpm_attrs };
+
+#define TPM_LONG_TIMEOUT (10 * 60 * HZ)
+
+static struct tpm_vendor_specific tpm_vtpm = {
+ .recv = vtpm_recv,
+ .send = vtpm_send,
+ .cancel = vtpm_cancel,
+ .status = vtpm_status,
+ .req_complete_mask = STATUS_BUSY | STATUS_DATA_AVAIL,
+ .req_complete_val = STATUS_DATA_AVAIL,
+ .req_canceled = STATUS_READY,
+ .attr_group = &vtpm_attr_grp,
+ .miscdev = {
+ .fops = &vtpm_ops,
+ },
+ .duration = {
+ TPM_LONG_TIMEOUT,
+ TPM_LONG_TIMEOUT,
+ TPM_LONG_TIMEOUT,
+ },
+};
+
+struct tpm_chip *init_vtpm(struct device *dev,
+ struct tpm_private *tp)
+{
+ long rc;
+ struct tpm_chip *chip;
+ struct vtpm_state *vtpms;
+
+ vtpms = kzalloc(sizeof(struct vtpm_state), GFP_KERNEL);
+ if (!vtpms)
+ return ERR_PTR(-ENOMEM);
+
+ vtpm_state_init(vtpms);
+ vtpms->tpm_private = tp;
+
+ chip = tpm_register_hardware(dev, &tpm_vtpm);
+ if (!chip) {
+ rc = -ENODEV;
+ goto err_free_mem;
+ }
+
+ chip_set_private(chip, vtpms);
+
+ return chip;
+
+err_free_mem:
+ kfree(vtpms);
+
+ return ERR_PTR(rc);
+}
+
+void cleanup_vtpm(struct device *dev)
+{
+ struct tpm_chip *chip = dev_get_drvdata(dev);
+ struct vtpm_state *vtpms = (struct vtpm_state*)chip_get_private(chip);
+ tpm_remove_hardware(dev);
+ kfree(vtpms);
+}
--- /dev/null
+#ifndef TPM_VTPM_H
+#define TPM_VTPM_H
+
+struct tpm_chip;
+struct tpm_private;
+
+struct vtpm_state {
+ struct transmission *current_request;
+ spinlock_t req_list_lock;
+ wait_queue_head_t req_wait_queue;
+
+ struct list_head queued_requests;
+
+ struct transmission *current_response;
+ spinlock_t resp_list_lock;
+ wait_queue_head_t resp_wait_queue; // processes waiting for responses
+
+ u8 vd_status;
+ u8 flags;
+
+ unsigned long disconnect_time;
+
+ /*
+ * The following is a private structure of the underlying
+ * driver. It is passed as parameter in the send function.
+ */
+ struct tpm_private *tpm_private;
+};
+
+
+enum vdev_status {
+ TPM_VD_STATUS_DISCONNECTED = 0x0,
+ TPM_VD_STATUS_CONNECTED = 0x1
+};
+
+/* this function is called from tpm_vtpm.c */
+int vtpm_vd_send(struct tpm_private * tp,
+ const u8 * buf, size_t count, void *ptr);
+
+/* these functions are offered by tpm_vtpm.c */
+struct tpm_chip *init_vtpm(struct device *,
+ struct tpm_private *);
+void cleanup_vtpm(struct device *);
+int vtpm_vd_recv(const struct tpm_chip* chip,
+ const unsigned char *buffer, size_t count, void *ptr);
+void vtpm_vd_status(const struct tpm_chip *, u8 status);
+
+static inline struct tpm_private *tpm_private_from_dev(struct device *dev)
+{
+ struct tpm_chip *chip = dev_get_drvdata(dev);
+ struct vtpm_state *vtpms = chip_get_private(chip);
+ return vtpms->tpm_private;
+}
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2005, IBM Corporation
+ *
+ * Author: Stefan Berger, stefanb@us.ibm.com
+ * Grant table support: Mahadevan Gomathisankaran
+ *
+ * This code has been derived from drivers/xen/netfront/netfront.c
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <asm/uaccess.h>
+#include <xen/evtchn.h>
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/tpmif.h>
+#include <xen/gnttab.h>
+#include <xen/xenbus.h>
+#include "tpm.h"
+#include "tpm_vtpm.h"
+
+#undef DEBUG
+
+/* local structures */
+struct tpm_private {
+ struct tpm_chip *chip;
+
+ tpmif_tx_interface_t *tx;
+ atomic_t refcnt;
+ unsigned int irq;
+ u8 is_connected;
+ u8 is_suspended;
+
+ spinlock_t tx_lock;
+
+ struct tx_buffer *tx_buffers[TPMIF_TX_RING_SIZE];
+
+ atomic_t tx_busy;
+ void *tx_remember;
+
+ domid_t backend_id;
+ wait_queue_head_t wait_q;
+
+ struct xenbus_device *dev;
+ int ring_ref;
+};
+
+struct tx_buffer {
+ unsigned int size; // available space in data
+ unsigned int len; // used space in data
+ unsigned char *data; // pointer to a page
+};
+
+
+/* locally visible variables */
+static grant_ref_t gref_head;
+static struct tpm_private *my_priv;
+
+/* local function prototypes */
+static irqreturn_t tpmif_int(int irq,
+ void *tpm_priv);
+static void tpmif_rx_action(unsigned long unused);
+static int tpmif_connect(struct xenbus_device *dev,
+ struct tpm_private *tp,
+ domid_t domid);
+static DECLARE_TASKLET(tpmif_rx_tasklet, tpmif_rx_action, 0);
+static int tpmif_allocate_tx_buffers(struct tpm_private *tp);
+static void tpmif_free_tx_buffers(struct tpm_private *tp);
+static void tpmif_set_connected_state(struct tpm_private *tp,
+ u8 newstate);
+static int tpm_xmit(struct tpm_private *tp,
+ const u8 * buf, size_t count, int userbuffer,
+ void *remember);
+static void destroy_tpmring(struct tpm_private *tp);
+void __exit tpmif_exit(void);
+
+#define DPRINTK(fmt, args...) \
+ pr_debug("xen_tpm_fr (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args)
+#define IPRINTK(fmt, args...) \
+ printk(KERN_INFO "xen_tpm_fr: " fmt, ##args)
+#define WPRINTK(fmt, args...) \
+ printk(KERN_WARNING "xen_tpm_fr: " fmt, ##args)
+
+#define GRANT_INVALID_REF 0
+
+
+static inline int
+tx_buffer_copy(struct tx_buffer *txb, const u8 *src, int len,
+ int isuserbuffer)
+{
+ int copied = len;
+
+ if (len > txb->size)
+ copied = txb->size;
+ if (isuserbuffer) {
+ if (copy_from_user(txb->data, src, copied))
+ return -EFAULT;
+ } else {
+ memcpy(txb->data, src, copied);
+ }
+ txb->len = len;
+ return copied;
+}
+
+static inline struct tx_buffer *tx_buffer_alloc(void)
+{
+ struct tx_buffer *txb;
+
+ txb = kzalloc(sizeof(struct tx_buffer), GFP_KERNEL);
+ if (!txb)
+ return NULL;
+
+ txb->len = 0;
+ txb->size = PAGE_SIZE;
+ txb->data = (unsigned char *)__get_free_page(GFP_KERNEL);
+ if (txb->data == NULL) {
+ kfree(txb);
+ txb = NULL;
+ }
+
+ return txb;
+}
+
+
+static inline void tx_buffer_free(struct tx_buffer *txb)
+{
+ if (txb) {
+ free_page((long)txb->data);
+ kfree(txb);
+ }
+}
+
+/**************************************************************
+ Utility function for the tpm_private structure
+**************************************************************/
+static void tpm_private_init(struct tpm_private *tp)
+{
+ spin_lock_init(&tp->tx_lock);
+ init_waitqueue_head(&tp->wait_q);
+ atomic_set(&tp->refcnt, 1);
+}
+
+static void tpm_private_put(void)
+{
+ if (!atomic_dec_and_test(&my_priv->refcnt))
+ return;
+
+ tpmif_free_tx_buffers(my_priv);
+ kfree(my_priv);
+ my_priv = NULL;
+}
+
+static struct tpm_private *tpm_private_get(void)
+{
+ int err;
+
+ if (my_priv) {
+ atomic_inc(&my_priv->refcnt);
+ return my_priv;
+ }
+
+ my_priv = kzalloc(sizeof(struct tpm_private), GFP_KERNEL);
+ if (!my_priv)
+ return NULL;
+
+ tpm_private_init(my_priv);
+ err = tpmif_allocate_tx_buffers(my_priv);
+ if (err < 0)
+ tpm_private_put();
+
+ return my_priv;
+}
+
+/**************************************************************
+
+ The interface to let the tpm plugin register its callback
+ function and send data to another partition using this module
+
+**************************************************************/
+
+static DEFINE_MUTEX(suspend_lock);
+/*
+ * Send data via this module by calling this function
+ */
+int vtpm_vd_send(struct tpm_private *tp,
+ const u8 * buf, size_t count, void *ptr)
+{
+ int sent;
+
+ mutex_lock(&suspend_lock);
+ sent = tpm_xmit(tp, buf, count, 0, ptr);
+ mutex_unlock(&suspend_lock);
+
+ return sent;
+}
+
+/**************************************************************
+ XENBUS support code
+**************************************************************/
+
+static int setup_tpmring(struct xenbus_device *dev,
+ struct tpm_private *tp)
+{
+ tpmif_tx_interface_t *sring;
+ int err;
+
+ tp->ring_ref = GRANT_INVALID_REF;
+
+ sring = (void *)__get_free_page(GFP_KERNEL);
+ if (!sring) {
+ xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
+ return -ENOMEM;
+ }
+ tp->tx = sring;
+
+ err = xenbus_grant_ring(dev, virt_to_mfn(tp->tx));
+ if (err < 0) {
+ free_page((unsigned long)sring);
+ tp->tx = NULL;
+ xenbus_dev_fatal(dev, err, "allocating grant reference");
+ goto fail;
+ }
+ tp->ring_ref = err;
+
+ err = tpmif_connect(dev, tp, dev->otherend_id);
+ if (err)
+ goto fail;
+
+ return 0;
+fail:
+ destroy_tpmring(tp);
+ return err;
+}
+
+
+static void destroy_tpmring(struct tpm_private *tp)
+{
+ tpmif_set_connected_state(tp, 0);
+
+ if (tp->ring_ref != GRANT_INVALID_REF) {
+ gnttab_end_foreign_access(tp->ring_ref, (unsigned long)tp->tx);
+ tp->ring_ref = GRANT_INVALID_REF;
+ tp->tx = NULL;
+ }
+
+ if (tp->irq)
+ unbind_from_irqhandler(tp->irq, tp);
+
+ tp->irq = 0;
+}
+
+
+static int talk_to_backend(struct xenbus_device *dev,
+ struct tpm_private *tp)
+{
+ const char *message = NULL;
+ int err;
+ struct xenbus_transaction xbt;
+
+ err = setup_tpmring(dev, tp);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "setting up ring");
+ goto out;
+ }
+
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "starting transaction");
+ goto destroy_tpmring;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename,
+ "ring-ref","%u", tp->ring_ref);
+ if (err) {
+ message = "writing ring-ref";
+ goto abort_transaction;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
+ irq_to_evtchn_port(tp->irq));
+ if (err) {
+ message = "writing event-channel";
+ goto abort_transaction;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ xenbus_dev_fatal(dev, err, "completing transaction");
+ goto destroy_tpmring;
+ }
+
+ xenbus_switch_state(dev, XenbusStateConnected);
+
+ return 0;
+
+abort_transaction:
+ xenbus_transaction_end(xbt, 1);
+ if (message)
+ xenbus_dev_error(dev, err, "%s", message);
+destroy_tpmring:
+ destroy_tpmring(tp);
+out:
+ return err;
+}
+
+/**
+ * Callback received when the backend's state changes.
+ */
+static void backend_changed(struct xenbus_device *dev,
+ enum xenbus_state backend_state)
+{
+ struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
+ DPRINTK("\n");
+
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitWait:
+ case XenbusStateInitialised:
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
+ case XenbusStateUnknown:
+ break;
+
+ case XenbusStateConnected:
+ tpmif_set_connected_state(tp, 1);
+ break;
+
+ case XenbusStateClosing:
+ tpmif_set_connected_state(tp, 0);
+ xenbus_frontend_closed(dev);
+ break;
+
+ case XenbusStateClosed:
+ tpmif_set_connected_state(tp, 0);
+ if (tp->is_suspended == 0)
+ device_unregister(&dev->dev);
+ xenbus_frontend_closed(dev);
+ break;
+ }
+}
+
+static int tpmfront_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err;
+ int handle;
+ struct tpm_private *tp = tpm_private_get();
+
+ if (!tp)
+ return -ENOMEM;
+
+ tp->chip = init_vtpm(&dev->dev, tp);
+ if (IS_ERR(tp->chip))
+ return PTR_ERR(tp->chip);
+
+ err = xenbus_scanf(XBT_NIL, dev->nodename,
+ "handle", "%i", &handle);
+ if (XENBUS_EXIST_ERR(err))
+ return err;
+
+ if (err < 0) {
+ xenbus_dev_fatal(dev,err,"reading virtual-device");
+ return err;
+ }
+
+ tp->dev = dev;
+
+ err = talk_to_backend(dev, tp);
+ if (err) {
+ tpm_private_put();
+ return err;
+ }
+
+ return 0;
+}
+
+
+static int tpmfront_remove(struct xenbus_device *dev)
+{
+ struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
+ destroy_tpmring(tp);
+ cleanup_vtpm(&dev->dev);
+ return 0;
+}
+
+static int tpmfront_suspend(struct xenbus_device *dev)
+{
+ struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
+ u32 ctr;
+
+ /* Take the lock, preventing any application from sending. */
+ mutex_lock(&suspend_lock);
+ tp->is_suspended = 1;
+
+ for (ctr = 0; atomic_read(&tp->tx_busy); ctr++) {
+ if ((ctr % 10) == 0)
+ printk("TPM-FE [INFO]: Waiting for outstanding "
+ "request.\n");
+ /* Wait for a request to be responded to. */
+ interruptible_sleep_on_timeout(&tp->wait_q, 100);
+ }
+
+ return 0;
+}
+
+static int tpmfront_suspend_finish(struct tpm_private *tp)
+{
+ tp->is_suspended = 0;
+ /* Allow applications to send again. */
+ mutex_unlock(&suspend_lock);
+ return 0;
+}
+
+static int tpmfront_suspend_cancel(struct xenbus_device *dev)
+{
+ struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
+ return tpmfront_suspend_finish(tp);
+}
+
+static int tpmfront_resume(struct xenbus_device *dev)
+{
+ struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
+ destroy_tpmring(tp);
+ return talk_to_backend(dev, tp);
+}
+
+static int tpmif_connect(struct xenbus_device *dev,
+ struct tpm_private *tp,
+ domid_t domid)
+{
+ int err;
+
+ tp->backend_id = domid;
+
+ err = bind_listening_port_to_irqhandler(
+ domid, tpmif_int, IRQF_SAMPLE_RANDOM, "tpmif", tp);
+ if (err <= 0) {
+ WPRINTK("bind_listening_port_to_irqhandler failed "
+ "(err=%d)\n", err);
+ return err;
+ }
+ tp->irq = err;
+
+ return 0;
+}
+
+static struct xenbus_device_id tpmfront_ids[] = {
+ { "vtpm" },
+ { "" }
+};
+
+static struct xenbus_driver tpmfront = {
+ .name = "vtpm",
+ .ids = tpmfront_ids,
+ .probe = tpmfront_probe,
+ .remove = tpmfront_remove,
+ .resume = tpmfront_resume,
+ .otherend_changed = backend_changed,
+ .suspend = tpmfront_suspend,
+ .suspend_cancel = tpmfront_suspend_cancel,
+};
+
+static int __init init_tpm_xenbus(void)
+{
+ return xenbus_register_frontend(&tpmfront);
+}
+
+static int tpmif_allocate_tx_buffers(struct tpm_private *tp)
+{
+ unsigned int i;
+
+ for (i = 0; i < TPMIF_TX_RING_SIZE; i++) {
+ tp->tx_buffers[i] = tx_buffer_alloc();
+ if (!tp->tx_buffers[i]) {
+ tpmif_free_tx_buffers(tp);
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+static void tpmif_free_tx_buffers(struct tpm_private *tp)
+{
+ unsigned int i;
+
+ for (i = 0; i < TPMIF_TX_RING_SIZE; i++)
+ tx_buffer_free(tp->tx_buffers[i]);
+}
+
+static void tpmif_rx_action(unsigned long priv)
+{
+ struct tpm_private *tp = (struct tpm_private *)priv;
+ int i = 0;
+ unsigned int received;
+ unsigned int offset = 0;
+ u8 *buffer;
+ tpmif_tx_request_t *tx = &tp->tx->ring[i].req;
+
+ atomic_set(&tp->tx_busy, 0);
+ wake_up_interruptible(&tp->wait_q);
+
+ received = tx->size;
+
+ buffer = kmalloc(received, GFP_ATOMIC);
+ if (!buffer)
+ return;
+
+ for (i = 0; i < TPMIF_TX_RING_SIZE && offset < received; i++) {
+ struct tx_buffer *txb = tp->tx_buffers[i];
+ tpmif_tx_request_t *tx;
+ unsigned int tocopy;
+
+ tx = &tp->tx->ring[i].req;
+ tocopy = tx->size;
+ if (tocopy > PAGE_SIZE)
+ tocopy = PAGE_SIZE;
+
+ memcpy(&buffer[offset], txb->data, tocopy);
+
+ gnttab_release_grant_reference(&gref_head, tx->ref);
+
+ offset += tocopy;
+ }
+
+ vtpm_vd_recv(tp->chip, buffer, received, tp->tx_remember);
+ kfree(buffer);
+}
+
+
+static irqreturn_t tpmif_int(int irq, void *tpm_priv)
+{
+ struct tpm_private *tp = tpm_priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tp->tx_lock, flags);
+ tpmif_rx_tasklet.data = (unsigned long)tp;
+ tasklet_schedule(&tpmif_rx_tasklet);
+ spin_unlock_irqrestore(&tp->tx_lock, flags);
+
+ return IRQ_HANDLED;
+}
+
+
+static int tpm_xmit(struct tpm_private *tp,
+ const u8 * buf, size_t count, int isuserbuffer,
+ void *remember)
+{
+ tpmif_tx_request_t *tx;
+ TPMIF_RING_IDX i;
+ unsigned int offset = 0;
+
+ spin_lock_irq(&tp->tx_lock);
+
+ if (unlikely(atomic_read(&tp->tx_busy))) {
+ printk("tpm_xmit: There's an outstanding request/response "
+ "on the way!\n");
+ spin_unlock_irq(&tp->tx_lock);
+ return -EBUSY;
+ }
+
+ if (tp->is_connected != 1) {
+ spin_unlock_irq(&tp->tx_lock);
+ return -EIO;
+ }
+
+ for (i = 0; count > 0 && i < TPMIF_TX_RING_SIZE; i++) {
+ struct tx_buffer *txb = tp->tx_buffers[i];
+ int copied;
+
+ if (!txb) {
+ DPRINTK("txb (i=%d) is NULL. buffers initilized?\n"
+ "Not transmitting anything!\n", i);
+ spin_unlock_irq(&tp->tx_lock);
+ return -EFAULT;
+ }
+
+ copied = tx_buffer_copy(txb, &buf[offset], count,
+ isuserbuffer);
+ if (copied < 0) {
+ /* An error occurred */
+ spin_unlock_irq(&tp->tx_lock);
+ return copied;
+ }
+ count -= copied;
+ offset += copied;
+
+ tx = &tp->tx->ring[i].req;
+ tx->addr = virt_to_machine(txb->data);
+ tx->size = txb->len;
+ tx->unused = 0;
+
+ DPRINTK("First 4 characters sent by TPM-FE are "
+ "0x%02x 0x%02x 0x%02x 0x%02x\n",
+ txb->data[0],txb->data[1],txb->data[2],txb->data[3]);
+
+ /* Get the granttable reference for this page. */
+ tx->ref = gnttab_claim_grant_reference(&gref_head);
+ if (tx->ref == -ENOSPC) {
+ spin_unlock_irq(&tp->tx_lock);
+ DPRINTK("Grant table claim reference failed in "
+ "func:%s line:%d file:%s\n",
+ __FUNCTION__, __LINE__, __FILE__);
+ return -ENOSPC;
+ }
+ gnttab_grant_foreign_access_ref(tx->ref,
+ tp->backend_id,
+ virt_to_mfn(txb->data),
+ 0 /*RW*/);
+ wmb();
+ }
+
+ atomic_set(&tp->tx_busy, 1);
+ tp->tx_remember = remember;
+
+ mb();
+
+ notify_remote_via_irq(tp->irq);
+
+ spin_unlock_irq(&tp->tx_lock);
+ return offset;
+}
+
+
+static void tpmif_notify_upperlayer(struct tpm_private *tp)
+{
+ /* Notify upper layer about the state of the connection to the BE. */
+ vtpm_vd_status(tp->chip, (tp->is_connected
+ ? TPM_VD_STATUS_CONNECTED
+ : TPM_VD_STATUS_DISCONNECTED));
+}
+
+
+static void tpmif_set_connected_state(struct tpm_private *tp, u8 is_connected)
+{
+ /*
+ * Don't notify upper layer if we are in suspend mode and
+ * should disconnect - assumption is that we will resume
+ * The mutex keeps apps from sending.
+ */
+ if (is_connected == 0 && tp->is_suspended == 1)
+ return;
+
+ /*
+ * Unlock the mutex if we are connected again
+ * after being suspended - now resuming.
+ * This also removes the suspend state.
+ */
+ if (is_connected == 1 && tp->is_suspended == 1)
+ tpmfront_suspend_finish(tp);
+
+ if (is_connected != tp->is_connected) {
+ tp->is_connected = is_connected;
+ tpmif_notify_upperlayer(tp);
+ }
+}
+
+
+
+/* =================================================================
+ * Initialization function.
+ * =================================================================
+ */
+
+
+static int __init tpmif_init(void)
+{
+ struct tpm_private *tp;
+
+ if (is_initial_xendomain())
+ return -EPERM;
+
+ tp = tpm_private_get();
+ if (!tp)
+ return -ENOMEM;
+
+ IPRINTK("Initialising the vTPM driver.\n");
+ if (gnttab_alloc_grant_references(TPMIF_TX_RING_SIZE,
+ &gref_head) < 0) {
+ tpm_private_put();
+ return -EFAULT;
+ }
+
+ init_tpm_xenbus();
+ return 0;
+}
+
+
+module_init(tpmif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
config CPU_FREQ
bool "CPU Frequency scaling"
+ depends on !XEN
help
CPU Frequency scaling allows you to change the clock speed of
CPUs on the fly. This is a nice method to save power, because
{
u64 addr = BLK_BOUNCE_HIGH; /* dma64_addr_t */
+#ifndef CONFIG_XEN
if (!PCI_DMA_BUS_IS_PHYS) {
addr = BLK_BOUNCE_ANY;
} else if (on && drive->media == ide_disk) {
if (dev && dev->dma_mask)
addr = *dev->dma_mask;
}
+#else
+ if (on && drive->media == ide_disk) {
+ struct device *dev = drive->hwif->dev;
+
+ if (!PCI_DMA_BUS_IS_PHYS)
+ addr = BLK_BOUNCE_ANY;
+ else if (dev && dev->dma_mask)
+ addr = *dev->dma_mask;
+ }
+#endif
if (drive->queue)
blk_queue_bounce_limit(drive->queue, addr);
static struct xenbus_driver xenkbd = {
.name = "vkbd",
- .owner = THIS_MODULE,
.ids = xenkbd_ids,
.probe = xenkbd_probe,
.remove = xenkbd_remove,
source "drivers/s390/net/Kconfig"
-config XEN_NETDEV_FRONTEND
+config XEN_NETFRONT
tristate "Xen network device frontend driver"
- depends on XEN
+ depends on PARAVIRT_XEN
default y
help
The network device frontend driver allows the kernel to
obj-$(CONFIG_SLIP) += slip.o
obj-$(CONFIG_SLHC) += slhc.o
-obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
+obj-$(CONFIG_XEN_NETFRONT) += xen-netfront.o
obj-$(CONFIG_DUMMY) += dummy.o
obj-$(CONFIG_IFB) += ifb.o
#include <linux/skbuff.h>
#include <linux/ethtool.h>
#include <linux/if_ether.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
#include <linux/moduleparam.h>
#include <linux/mm.h>
#include <net/ip.h>
return cons;
}
-static int skb_checksum_setup(struct sk_buff *skb)
-{
- struct iphdr *iph;
- unsigned char *th;
- int err = -EPROTO;
-
- if (skb->protocol != htons(ETH_P_IP))
- goto out;
-
- iph = (void *)skb->data;
- th = skb->data + 4 * iph->ihl;
- if (th >= skb_tail_pointer(skb))
- goto out;
-
- skb->csum_start = th - skb->head;
- switch (iph->protocol) {
- case IPPROTO_TCP:
- skb->csum_offset = offsetof(struct tcphdr, check);
- break;
- case IPPROTO_UDP:
- skb->csum_offset = offsetof(struct udphdr, check);
- break;
- default:
- if (net_ratelimit())
- printk(KERN_ERR "Attempting to checksum a non-"
- "TCP/UDP packet, dropping a protocol"
- " %d packet", iph->protocol);
- goto out;
- }
-
- if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
- goto out;
-
- err = 0;
-
-out:
- return err;
-}
-
static int handle_incoming_queue(struct net_device *dev,
struct sk_buff_head *rxq)
{
static struct xenbus_driver netfront = {
.name = "vif",
- .owner = THIS_MODULE,
.ids = netfront_ids,
.probe = netfront_probe,
.remove = __devexit_p(xennet_remove),
*
* @author John Levon <levon@movementarian.org>
*
+ * Modified by Aravind Menon for Xen
+ * These modifications are:
+ * Copyright (C) 2005 Hewlett-Packard Co.
+ *
* This is the core of the buffer management. Each
* CPU buffer is processed and entered into the
* global event buffer. Such processing is necessary
static DEFINE_SPINLOCK(task_mortuary);
static void process_task_mortuary(void);
+#ifdef CONFIG_XEN
+static int cpu_current_domain[NR_CPUS];
+#endif
/* Take ownership of the task struct and place it on the
* list for processing. Only after two full buffer syncs
int sync_start(void)
{
int err;
+#ifdef CONFIG_XEN
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ cpu_current_domain[i] = COORDINATOR_DOMAIN;
+ }
+#endif
start_cpu_work();
last_cookie = INVALID_COOKIE;
}
-static void add_kernel_ctx_switch(unsigned int in_kernel)
+static void add_cpu_mode_switch(unsigned int cpu_mode)
{
add_event_entry(ESCAPE_CODE);
- if (in_kernel)
- add_event_entry(KERNEL_ENTER_SWITCH_CODE);
- else
- add_event_entry(KERNEL_EXIT_SWITCH_CODE);
+ switch (cpu_mode) {
+ case CPU_MODE_USER:
+ add_event_entry(USER_ENTER_SWITCH_CODE);
+ break;
+ case CPU_MODE_KERNEL:
+ add_event_entry(KERNEL_ENTER_SWITCH_CODE);
+ break;
+ case CPU_MODE_XEN:
+ add_event_entry(XEN_ENTER_SWITCH_CODE);
+ break;
+ default:
+ break;
+ }
}
-
+
+#ifdef CONFIG_XEN
+static void add_domain_switch(unsigned long domain_id)
+{
+ add_event_entry(ESCAPE_CODE);
+ add_event_entry(DOMAIN_SWITCH_CODE);
+ add_event_entry(domain_id);
+}
+#endif
+
static void
add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
{
* for later lookup from userspace.
*/
static int
-add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
+add_sample(struct mm_struct * mm, struct op_sample * s, int cpu_mode)
{
- if (in_kernel) {
+ if (cpu_mode >= CPU_MODE_KERNEL) {
add_sample_entry(s->eip, s->event);
return 1;
} else if (mm) {
struct mm_struct *mm = NULL;
struct task_struct * new;
unsigned long cookie = 0;
- int in_kernel = 1;
+ int cpu_mode = 1;
unsigned int i;
sync_buffer_state state = sb_buffer_start;
unsigned long available;
+ int domain_switch = 0;
mutex_lock(&buffer_mutex);
add_cpu_switch(cpu);
+#ifdef CONFIG_XEN
+ /* We need to assign the first samples in this CPU buffer to the
+ same domain that we were processing at the last sync_buffer */
+ if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) {
+ add_domain_switch(cpu_current_domain[cpu]);
+ }
+#endif
+
/* Remember, only we can modify tail_pos */
available = get_slots(cpu_buf);
for (i = 0; i < available; ++i) {
struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];
- if (is_code(s->eip)) {
- if (s->event <= CPU_IS_KERNEL) {
- /* kernel/userspace switch */
- in_kernel = s->event;
+ if (is_code(s->eip) && !domain_switch) {
+ if (s->event <= CPU_MODE_XEN) {
+ /* xen/kernel/userspace switch */
+ cpu_mode = s->event;
if (state == sb_buffer_start)
state = sb_sample_start;
- add_kernel_ctx_switch(s->event);
+ add_cpu_mode_switch(s->event);
} else if (s->event == CPU_TRACE_BEGIN) {
state = sb_bt_start;
add_trace_begin();
+#ifdef CONFIG_XEN
+ } else if (s->event == CPU_DOMAIN_SWITCH) {
+ domain_switch = 1;
+#endif
} else {
struct mm_struct * oldmm = mm;
add_user_ctx_switch(new, cookie);
}
} else {
+#ifdef CONFIG_XEN
+ if (domain_switch) {
+ cpu_current_domain[cpu] = s->eip;
+ add_domain_switch(s->eip);
+ domain_switch = 0;
+ } else if (cpu_current_domain[cpu] !=
+ COORDINATOR_DOMAIN) {
+ add_sample_entry(s->eip, s->event);
+ } else
+#endif
if (state >= sb_bt_start &&
- !add_sample(mm, s, in_kernel)) {
+ !add_sample(mm, s, cpu_mode)) {
if (state == sb_bt_start) {
state = sb_bt_ignore;
atomic_inc(&oprofile_stats.bt_lost_no_mapping);
}
release_mm(mm);
+#ifdef CONFIG_XEN
+ /* We reset domain to COORDINATOR at each CPU switch */
+ if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) {
+ add_domain_switch(COORDINATOR_DOMAIN);
+ }
+#endif
+
mark_done(cpu);
mutex_unlock(&buffer_mutex);
*
* @author John Levon <levon@movementarian.org>
*
+ * Modified by Aravind Menon for Xen
+ * These modifications are:
+ * Copyright (C) 2005 Hewlett-Packard Co.
+ *
* Each CPU has a local buffer that stores PC value/event
* pairs. We also log context switches when we notice them.
* Eventually each CPU's buffer is processed into the global
#define DEFAULT_TIMER_EXPIRE (HZ / 10)
static int work_enabled;
+#ifndef CONFIG_XEN
+#define current_domain COORDINATOR_DOMAIN
+#else
+static int32_t current_domain = COORDINATOR_DOMAIN;
+#endif
+
void free_cpu_buffers(void)
{
int i;
goto fail;
b->last_task = NULL;
- b->last_is_kernel = -1;
+ b->last_cpu_mode = -1;
b->tracing = 0;
b->buffer_size = buffer_size;
b->tail_pos = 0;
* collected will populate the buffer with proper
* values to initialize the buffer
*/
- cpu_buf->last_is_kernel = -1;
+ cpu_buf->last_cpu_mode = -1;
cpu_buf->last_task = NULL;
}
* because of the head/tail separation of the writer and reader
* of the CPU buffer.
*
- * is_kernel is needed because on some architectures you cannot
+ * cpu_mode is needed because on some architectures you cannot
* tell if you are in kernel or user space simply by looking at
- * pc. We tag this in the buffer by generating kernel enter/exit
- * events whenever is_kernel changes
+ * pc. We tag this in the buffer by generating kernel/user (and xen)
+ * enter events whenever cpu_mode changes
*/
static int log_sample(struct oprofile_cpu_buffer * cpu_buf, unsigned long pc,
- int is_kernel, unsigned long event)
+ int cpu_mode, unsigned long event)
{
struct task_struct * task;
return 0;
}
- is_kernel = !!is_kernel;
-
task = current;
/* notice a switch from user->kernel or vice versa */
- if (cpu_buf->last_is_kernel != is_kernel) {
- cpu_buf->last_is_kernel = is_kernel;
- add_code(cpu_buf, is_kernel);
+ if (cpu_buf->last_cpu_mode != cpu_mode) {
+ cpu_buf->last_cpu_mode = cpu_mode;
+ add_code(cpu_buf, cpu_mode);
}
-
+
/* notice a task switch */
- if (cpu_buf->last_task != task) {
+ /* if not processing other domain samples */
+ if ((cpu_buf->last_task != task) &&
+ (current_domain == COORDINATOR_DOMAIN)) {
cpu_buf->last_task = task;
add_code(cpu_buf, (unsigned long)task);
}
add_sample(cpu_buf, pc, 0);
}
+#ifdef CONFIG_XEN
+int oprofile_add_domain_switch(int32_t domain_id)
+{
+ struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
+
+ /* should have space for switching into and out of domain
+ (2 slots each) plus one sample and one cpu mode switch */
+ if (((nr_available_slots(cpu_buf) < 6) &&
+ (domain_id != COORDINATOR_DOMAIN)) ||
+ (nr_available_slots(cpu_buf) < 2))
+ return 0;
+
+ add_code(cpu_buf, CPU_DOMAIN_SWITCH);
+ add_sample(cpu_buf, domain_id, 0);
+
+ current_domain = domain_id;
+
+ return 1;
+}
+#endif
+
/*
* This serves to avoid cpu buffer overflow, and makes sure
* the task mortuary progresses
volatile unsigned long tail_pos;
unsigned long buffer_size;
struct task_struct * last_task;
- int last_is_kernel;
+ int last_cpu_mode;
int tracing;
struct op_sample * buffer;
unsigned long sample_received;
void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf);
/* transient events for the CPU buffer -> event buffer */
-#define CPU_IS_KERNEL 1
-#define CPU_TRACE_BEGIN 2
+#define CPU_MODE_USER 0
+#define CPU_MODE_KERNEL 1
+#define CPU_MODE_XEN 2
+#define CPU_TRACE_BEGIN 3
+#define CPU_DOMAIN_SWITCH 4
#endif /* OPROFILE_CPU_BUFFER_H */
#define INVALID_COOKIE ~0UL
#define NO_COOKIE 0UL
+/* Constant used to refer to coordinator domain (Xen) */
+#define COORDINATOR_DOMAIN -1
+
extern const struct file_operations event_buffer_fops;
/* mutex between sync_cpu_buffers() and the
* @remark Read the file COPYING
*
* @author John Levon <levon@movementarian.org>
+ *
+ * Modified by Aravind Menon for Xen
+ * These modifications are:
+ * Copyright (C) 2005 Hewlett-Packard Co.
*/
#include <linux/kernel.h>
*/
static int timer = 0;
+#ifdef CONFIG_XEN
+int oprofile_set_active(int active_domains[], unsigned int adomains)
+{
+ int err;
+
+ if (!oprofile_ops.set_active)
+ return -EINVAL;
+
+ mutex_lock(&start_mutex);
+ err = oprofile_ops.set_active(active_domains, adomains);
+ mutex_unlock(&start_mutex);
+ return err;
+}
+
+int oprofile_set_passive(int passive_domains[], unsigned int pdomains)
+{
+ int err;
+
+ if (!oprofile_ops.set_passive)
+ return -EINVAL;
+
+ mutex_lock(&start_mutex);
+ err = oprofile_ops.set_passive(passive_domains, pdomains);
+ mutex_unlock(&start_mutex);
+ return err;
+}
+#endif
+
int oprofile_setup(void)
{
int err;
void oprofile_timer_init(struct oprofile_operations * ops);
int oprofile_set_backtrace(unsigned long depth);
+
+int oprofile_set_active(int active_domains[], unsigned int adomains);
+int oprofile_set_passive(int passive_domains[], unsigned int pdomains);
#endif /* OPROF_H */
* @remark Read the file COPYING
*
* @author John Levon <levon@movementarian.org>
+ *
+ * Modified by Aravind Menon for Xen
+ * These modifications are:
+ * Copyright (C) 2005 Hewlett-Packard Co.
*/
#include <linux/fs.h>
#include <linux/oprofile.h>
+#include <asm/uaccess.h>
+#include <linux/ctype.h>
#include "event_buffer.h"
#include "oprofile_stats.h"
#include "oprof.h"
-
+
unsigned long fs_buffer_size = 131072;
unsigned long fs_cpu_buffer_size = 8192;
unsigned long fs_buffer_watershed = 32768; /* FIXME: tune */
static const struct file_operations dump_fops = {
.write = dump_write,
};
-
+
+#ifdef CONFIG_XEN
+
+#define TMPBUFSIZE 512
+
+static unsigned int adomains = 0;
+static int active_domains[MAX_OPROF_DOMAINS + 1];
+static DEFINE_MUTEX(adom_mutex);
+
+static ssize_t adomain_write(struct file * file, char const __user * buf,
+ size_t count, loff_t * offset)
+{
+ char *tmpbuf;
+ char *startp, *endp;
+ int i;
+ unsigned long val;
+ ssize_t retval = count;
+
+ if (*offset)
+ return -EINVAL;
+ if (count > TMPBUFSIZE - 1)
+ return -EINVAL;
+
+ if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
+ return -ENOMEM;
+
+ if (copy_from_user(tmpbuf, buf, count)) {
+ kfree(tmpbuf);
+ return -EFAULT;
+ }
+ tmpbuf[count] = 0;
+
+ mutex_lock(&adom_mutex);
+
+ startp = tmpbuf;
+ /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
+ for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
+ val = simple_strtoul(startp, &endp, 0);
+ if (endp == startp)
+ break;
+ while (ispunct(*endp) || isspace(*endp))
+ endp++;
+ active_domains[i] = val;
+ if (active_domains[i] != val)
+ /* Overflow, force error below */
+ i = MAX_OPROF_DOMAINS + 1;
+ startp = endp;
+ }
+ /* Force error on trailing junk */
+ adomains = *startp ? MAX_OPROF_DOMAINS + 1 : i;
+
+ kfree(tmpbuf);
+
+ if (adomains > MAX_OPROF_DOMAINS
+ || oprofile_set_active(active_domains, adomains)) {
+ adomains = 0;
+ retval = -EINVAL;
+ }
+
+ mutex_unlock(&adom_mutex);
+ return retval;
+}
+
+static ssize_t adomain_read(struct file * file, char __user * buf,
+ size_t count, loff_t * offset)
+{
+ char * tmpbuf;
+ size_t len;
+ int i;
+ ssize_t retval;
+
+ if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
+ return -ENOMEM;
+
+ mutex_lock(&adom_mutex);
+
+ len = 0;
+ for (i = 0; i < adomains; i++)
+ len += snprintf(tmpbuf + len,
+ len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
+ "%u ", active_domains[i]);
+ WARN_ON(len > TMPBUFSIZE);
+ if (len != 0 && len <= TMPBUFSIZE)
+ tmpbuf[len-1] = '\n';
+
+ mutex_unlock(&adom_mutex);
+
+ retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
+
+ kfree(tmpbuf);
+ return retval;
+}
+
+
+static struct file_operations active_domain_ops = {
+ .read = adomain_read,
+ .write = adomain_write,
+};
+
+static unsigned int pdomains = 0;
+static int passive_domains[MAX_OPROF_DOMAINS];
+static DEFINE_MUTEX(pdom_mutex);
+
+static ssize_t pdomain_write(struct file * file, char const __user * buf,
+ size_t count, loff_t * offset)
+{
+ char *tmpbuf;
+ char *startp, *endp;
+ int i;
+ unsigned long val;
+ ssize_t retval = count;
+
+ if (*offset)
+ return -EINVAL;
+ if (count > TMPBUFSIZE - 1)
+ return -EINVAL;
+
+ if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
+ return -ENOMEM;
+
+ if (copy_from_user(tmpbuf, buf, count)) {
+ kfree(tmpbuf);
+ return -EFAULT;
+ }
+ tmpbuf[count] = 0;
+
+ mutex_lock(&pdom_mutex);
+
+ startp = tmpbuf;
+ /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
+ for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
+ val = simple_strtoul(startp, &endp, 0);
+ if (endp == startp)
+ break;
+ while (ispunct(*endp) || isspace(*endp))
+ endp++;
+ passive_domains[i] = val;
+ if (passive_domains[i] != val)
+ /* Overflow, force error below */
+ i = MAX_OPROF_DOMAINS + 1;
+ startp = endp;
+ }
+ /* Force error on trailing junk */
+ pdomains = *startp ? MAX_OPROF_DOMAINS + 1 : i;
+
+ kfree(tmpbuf);
+
+ if (pdomains > MAX_OPROF_DOMAINS
+ || oprofile_set_passive(passive_domains, pdomains)) {
+ pdomains = 0;
+ retval = -EINVAL;
+ }
+
+ mutex_unlock(&pdom_mutex);
+ return retval;
+}
+
+static ssize_t pdomain_read(struct file * file, char __user * buf,
+ size_t count, loff_t * offset)
+{
+ char * tmpbuf;
+ size_t len;
+ int i;
+ ssize_t retval;
+
+ if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
+ return -ENOMEM;
+
+ mutex_lock(&pdom_mutex);
+
+ len = 0;
+ for (i = 0; i < pdomains; i++)
+ len += snprintf(tmpbuf + len,
+ len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
+ "%u ", passive_domains[i]);
+ WARN_ON(len > TMPBUFSIZE);
+ if (len != 0 && len <= TMPBUFSIZE)
+ tmpbuf[len-1] = '\n';
+
+ mutex_unlock(&pdom_mutex);
+
+ retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
+
+ kfree(tmpbuf);
+ return retval;
+}
+
+static struct file_operations passive_domain_ops = {
+ .read = pdomain_read,
+ .write = pdomain_write,
+};
+
+#endif /* CONFIG_XEN */
+
void oprofile_create_files(struct super_block * sb, struct dentry * root)
{
oprofilefs_create_file(sb, root, "enable", &enable_fops);
oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666);
+#ifdef CONFIG_XEN
+ oprofilefs_create_file(sb, root, "active_domains", &active_domain_ops);
+ oprofilefs_create_file(sb, root, "passive_domains", &passive_domain_ops);
+#endif
oprofilefs_create_file(sb, root, "buffer", &event_buffer_fops);
oprofilefs_create_ulong(sb, root, "buffer_size", &fs_buffer_size);
oprofilefs_create_ulong(sb, root, "buffer_watershed", &fs_buffer_watershed);
config HT_IRQ
bool "Interrupts on hypertransport devices"
default y
- depends on PCI && X86_LOCAL_APIC && X86_IO_APIC
+ depends on PCI && X86_LOCAL_APIC && X86_IO_APIC && !XEN
help
This allows native hypertransport devices to use interrupts.
#include "pci.h"
+extern int pci_mem_align;
+
/**
* pci_bus_alloc_resource - allocate a resource from a parent bus
* @bus: PCI bus
type_mask |= IORESOURCE_IO | IORESOURCE_MEM;
+#ifdef CONFIG_XEN
+ /* If the boot parameter 'pci-mem-align' was specified then we need to
+ align the memory addresses, at page size alignment. */
+ if (pci_mem_align && (align < (PAGE_SIZE-1)))
+ align = PAGE_SIZE - 1;
+#endif
+
for (i = 0; i < PCI_BUS_NUM_RESOURCES; i++) {
struct resource *r = bus->resource[i];
if (!r)
* Restore the BAR values for a given device, so as to make it
* accessible by its driver.
*/
+#ifndef CONFIG_XEN
static void
+#else
+EXPORT_SYMBOL_GPL(pci_restore_bars);
+void
+#endif
pci_restore_bars(struct pci_dev *dev)
{
int i, numres;
#include <linux/kallsyms.h>
#include "pci.h"
+#ifdef CONFIG_XEN
+/* A global flag which signals if we should page-align PCI mem windows. */
+int pci_mem_align = 0;
+
+static int __init set_pci_mem_align(char *str)
+{
+ pci_mem_align = 1;
+ return 1;
+}
+__setup("pci-mem-align", set_pci_mem_align);
+
+/* This quirk function enables us to force all memory resources which are
+ * assigned to PCI devices, to be page-aligned.
+ */
+static void __devinit quirk_align_mem_resources(struct pci_dev *dev)
+{
+ int i;
+ struct resource *r;
+ resource_size_t old_start;
+
+ if (!pci_mem_align)
+ return;
+
+ for (i=0; i < DEVICE_COUNT_RESOURCE; i++) {
+ r = &dev->resource[i];
+ if ((r == NULL) || !(r->flags & IORESOURCE_MEM))
+ continue;
+
+ old_start = r->start;
+ r->start = (r->start + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+ r->end = r->end - (old_start - r->start);
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, quirk_align_mem_resources);
+#endif
+
/* The Mellanox Tavor device gives false positive parity errors
* Mark this device with a broken_parity_status, to allow
* PCI scanning code to "skip" this now blacklisted device.
config SERIAL_8250
tristate "8250/16550 and compatible serial support"
depends on (BROKEN || !SPARC)
+ depends on !XEN_DISABLE_SERIAL
select SERIAL_CORE
---help---
This selects whether you want to include the driver for the standard
tristate "Cyberblade/i1 support"
depends on FB && PCI && X86_32 && !64BIT
select FB_CFB_IMAGEBLIT
- select VIDEO_SELECT
+ select VIDEO_SELECT if !XEN
---help---
This driver is supposed to support the Trident Cyberblade/i1
graphics core integrated in the VIA VT8601A North Bridge,
config XEN_FBDEV_FRONTEND
tristate "Xen virtual frame buffer support"
- depends on FB && XEN
+ depends on FB && PARAVIRT_XEN
select FB_SYS_FILLRECT
select FB_SYS_COPYAREA
select FB_SYS_IMAGEBLIT
config VIDEO_SELECT
bool "Video mode selection support"
depends on X86 && VGA_CONSOLE
+ depends on !XEN
---help---
This enables support for text mode selection on kernel startup. If
you want to take advantage of some high-resolution text mode your
static struct xenbus_driver xenfb = {
.name = "vfb",
- .owner = THIS_MODULE,
.ids = xenfb_ids,
.probe = xenfb_probe,
.remove = xenfb_remove,
+#
+# This Kconfig describe xen options
+#
+
+config XEN
+ bool
+
+if XEN
+config XEN_INTERFACE_VERSION
+ hex
+ default 0x00030207
+
+menu "XEN"
+
+config XEN_PRIVILEGED_GUEST
+ bool "Privileged Guest (domain 0)"
+ help
+ Support for privileged operation (domain 0)
+
+config XEN_UNPRIVILEGED_GUEST
+ def_bool !XEN_PRIVILEGED_GUEST
+
+config XEN_PRIVCMD
+ def_bool y
+ depends on PROC_FS
+
+config XEN_XENBUS_DEV
+ def_bool y
+ depends on PROC_FS
+
+config XEN_NETDEV_ACCEL_SFC_UTIL
+ depends on X86
+ tristate
+
+config XEN_BACKEND
+ tristate "Backend driver support"
+ default XEN_PRIVILEGED_GUEST
+ help
+ Support for backend device drivers that provide I/O services
+ to other virtual machines.
+
+config XEN_BLKDEV_BACKEND
+ tristate "Block-device backend driver"
+ depends on XEN_BACKEND
+ default XEN_BACKEND
+ help
+ The block-device backend driver allows the kernel to export its
+ block devices to other guests via a high-performance shared-memory
+ interface.
+
+config XEN_BLKDEV_TAP
+ tristate "Block-device tap backend driver"
+ depends on XEN_BACKEND
+ default XEN_BACKEND
+ help
+ The block tap driver is an alternative to the block back driver
+ and allows VM block requests to be redirected to userspace through
+ a device interface. The tap allows user-space development of
+ high-performance block backends, where disk images may be implemented
+ as files, in memory, or on other hosts across the network. This
+ driver can safely coexist with the existing blockback driver.
+
+config XEN_NETDEV_BACKEND
+ tristate "Network-device backend driver"
+ depends on XEN_BACKEND && NET
+ default XEN_BACKEND
+ help
+ The network-device backend driver allows the kernel to export its
+ network devices to other guests via a high-performance shared-memory
+ interface.
+
+config XEN_NETDEV_PIPELINED_TRANSMITTER
+ bool "Pipelined transmitter (DANGEROUS)"
+ depends on XEN_NETDEV_BACKEND
+ help
+ If the net backend is a dumb domain, such as a transparent Ethernet
+ bridge with no local IP interface, it is safe to say Y here to get
+ slightly lower network overhead.
+ If the backend has a local IP interface; or may be doing smart things
+ like reassembling packets to perform firewall filtering; or if you
+ are unsure; or if you experience network hangs when this option is
+ enabled; then you must say N here.
+
+config XEN_NETDEV_ACCEL_SFC_BACKEND
+ tristate "Network-device backend driver acceleration for Solarflare NICs"
+ depends on XEN_NETDEV_BACKEND && SFC && SFC_RESOURCE && X86
+ select XEN_NETDEV_ACCEL_SFC_UTIL
+ default m
+
+config XEN_NETDEV_LOOPBACK
+ tristate "Network-device loopback driver"
+ depends on XEN_NETDEV_BACKEND
+ help
+ A two-interface loopback device to emulate a local netfront-netback
+ connection. If unsure, it is probably safe to say N here.
+
+config XEN_PCIDEV_BACKEND
+ tristate "PCI-device backend driver"
+ depends on PCI && XEN_BACKEND
+ default XEN_BACKEND
+ help
+ The PCI device backend driver allows the kernel to export arbitrary
+ PCI devices to other guests. If you select this to be a module, you
+ will need to make sure no other driver has bound to the device(s)
+ you want to make visible to other guests.
+
+choice
+ prompt "PCI Backend Mode"
+ depends on XEN_PCIDEV_BACKEND
+ default XEN_PCIDEV_BACKEND_VPCI if !IA64
+ default XEN_PCIDEV_BACKEND_CONTROLLER if IA64
+
+config XEN_PCIDEV_BACKEND_VPCI
+ bool "Virtual PCI"
+ ---help---
+ This PCI Backend hides the true PCI topology and makes the frontend
+ think there is a single PCI bus with only the exported devices on it.
+ For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
+ second device at 02:1a.1 will be re-assigned to 00:01.1.
+
+config XEN_PCIDEV_BACKEND_PASS
+ bool "Passthrough"
+ ---help---
+ This PCI Backend provides a real view of the PCI topology to the
+ frontend (for example, a device at 06:01.b will still appear at
+ 06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
+ PCI devices to its driver domains. This may be required for drivers
+ which depend on finding their hardward in certain bus/slot
+ locations.
+
+config XEN_PCIDEV_BACKEND_SLOT
+ bool "Slot"
+ ---help---
+ This PCI Backend hides the true PCI topology and makes the frontend
+ think there is a single PCI bus with only the exported devices on it.
+ Contrary to the virtual PCI backend, a function becomes a new slot.
+ For example, a device at 03:05.2 will be re-assigned to 00:00.0. A
+ second device at 02:1a.1 will be re-assigned to 00:01.0.
+
+config XEN_PCIDEV_BACKEND_CONTROLLER
+ bool "Controller"
+ depends on IA64
+ ---help---
+ This PCI backend virtualizes the PCI bus topology by providing a
+ virtual bus per PCI root device. Devices which are physically under
+ the same root bus will appear on the same virtual bus. For systems
+ with complex I/O addressing, this is the only backend which supports
+ extended I/O port spaces and MMIO translation offsets. This backend
+ also supports slot virtualization. For example, a device at
+ 0000:01:02.1 will be re-assigned to 0000:00:00.0. A second device
+ at 0000:02:05.0 (behind a P2P bridge on bus 0000:01) will be
+ re-assigned to 0000:00:01.0. A third device at 0000:16:05.0 (under
+ a different PCI root bus) will be re-assigned to 0000:01:00.0.
+
+endchoice
+
+config XEN_PCIDEV_BE_DEBUG
+ bool "PCI Backend Debugging"
+ depends on XEN_PCIDEV_BACKEND
+
+config XEN_TPMDEV_BACKEND
+ tristate "TPM-device backend driver"
+ depends on XEN_BACKEND
+ help
+ The TPM-device backend driver
+
+config XEN_BLKDEV_FRONTEND
+ tristate "Block-device frontend driver"
+ default y
+ help
+ The block-device frontend driver allows the kernel to access block
+ devices mounted within another guest OS. Unless you are building a
+ dedicated device-driver domain, or your master control domain
+ (domain 0), then you almost certainly want to say Y here.
+
+config XEN_NETDEV_FRONTEND
+ tristate "Network-device frontend driver"
+ depends on NET
+ default y
+ help
+ The network-device frontend driver allows the kernel to access
+ network interfaces within another guest OS. Unless you are building a
+ dedicated device-driver domain, or your master control domain
+ (domain 0), then you almost certainly want to say Y here.
+
+config XEN_NETDEV_ACCEL_SFC_FRONTEND
+ tristate "Network-device frontend driver acceleration for Solarflare NICs"
+ depends on XEN_NETDEV_FRONTEND && X86
+ select XEN_NETDEV_ACCEL_SFC_UTIL
+ default m
+
+config XEN_GRANT_DEV
+ tristate "User-space granted page access driver"
+ default XEN_PRIVILEGED_GUEST
+ help
+ Device for accessing (in user-space) pages that have been granted
+ by other domains.
+
+config XEN_FRAMEBUFFER
+ tristate "Framebuffer-device frontend driver"
+ depends on FB
+ select FB_CFB_FILLRECT
+ select FB_CFB_COPYAREA
+ select FB_CFB_IMAGEBLIT
+ default y
+ help
+ The framebuffer-device frontend drivers allows the kernel to create a
+ virtual framebuffer. This framebuffer can be viewed in another
+ domain. Unless this domain has access to a real video card, you
+ probably want to say Y here.
+
+config XEN_KEYBOARD
+ tristate "Keyboard-device frontend driver"
+ depends on XEN_FRAMEBUFFER && INPUT
+ default y
+ help
+ The keyboard-device frontend driver allows the kernel to create a
+ virtual keyboard. This keyboard can then be driven by another
+ domain. If you've said Y to CONFIG_XEN_FRAMEBUFFER, you probably
+ want to say Y here.
+
+config XEN_CONSOLE
+ bool "Xen virtual console"
+ default y
+ help
+ The Xen virtual console is ...
+
+config XEN_DISABLE_SERIAL
+ bool "Disable serial port drivers"
+ default y
+ help
+ Disable serial port drivers, allowing the Xen console driver
+ to provide a serial console at ttyS0.
+
+config XEN_SYSFS
+ tristate "Export Xen attributes in sysfs"
+ depends on SYSFS
+ select SYS_HYPERVISOR
+ default y
+ help
+ Xen hypervisor attributes will show up under /sys/hypervisor/.
+
+choice
+ prompt "Xen version compatibility"
+ default XEN_COMPAT_030002_AND_LATER
+
+ config XEN_COMPAT_030002_AND_LATER
+ bool "3.0.2 and later"
+
+ config XEN_COMPAT_030004_AND_LATER
+ bool "3.0.4 and later"
+
+ config XEN_COMPAT_030100_AND_LATER
+ bool "3.1.0 and later"
+
+ config XEN_COMPAT_LATEST_ONLY
+ bool "no compatibility code"
+
+endchoice
+
+config XEN_COMPAT
+ hex
+ default 0xffffff if XEN_COMPAT_LATEST_ONLY
+ default 0x030100 if XEN_COMPAT_030100_AND_LATER
+ default 0x030004 if XEN_COMPAT_030004_AND_LATER
+ default 0x030002 if XEN_COMPAT_030002_AND_LATER
+ default 0
+
+endmenu
+
+config HAVE_IRQ_IGNORE_UNHANDLED
+ def_bool y
+
+config GENERIC_HARDIRQS_NO__DO_IRQ
+ def_bool y
+
+config NO_IDLE_HZ
+ def_bool y
+
+config XEN_SMPBOOT
+ def_bool y
+ depends on SMP && !PPC_XEN
+
+config XEN_XENCOMM
+ bool
+
+config XEN_DEVMEM
+ def_bool y
+
+endif
+
config XEN_BALLOON
- bool "Xen memory balloon driver"
- depends on XEN
+ bool "Xen memory balloon driver" if PARAVIRT_XEN
+ depends on (XEN && !PPC_XEN) || PARAVIRT_XEN
default y
help
The balloon driver allows the Xen domain to request more memory from
return unneeded memory to the system.
config XEN_SCRUB_PAGES
- bool "Scrub pages before returning them to system"
- depends on XEN_BALLOON
+ bool "Scrub memory before freeing it to Xen"
+ depends on XEN || XEN_BALLOON
default y
help
- Scrub pages before returning them to the system for reuse by
- other domains. This makes sure that any confidential data
- is not accidentally visible to other domains. Is it more
- secure, but slightly less efficient.
+ Erase memory contents before freeing it back to Xen's global
+ pool. This ensures that any secrets contained within that
+ memory (e.g., private keys) cannot be found by other guests that
+ may be running on the machine. Most people will want to say Y here.
+ If security is not a concern then you may increase performance by
+ saying N.
If in doubt, say yes.
-obj-y += grant-table.o features.o events.o
-obj-y += xenbus/
-obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
-obj-$(CONFIG_XEN_BALLOON) += balloon.o
+obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
+xen-xencomm-$(CONFIG_PARAVIRT_XEN) := xencomm.o
+xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
+
+xen-balloon-$(CONFIG_XEN) := balloon/
+obj-$(CONFIG_XEN) += core/
+obj-$(CONFIG_XEN) += console/
+obj-$(CONFIG_XEN) += evtchn/
+obj-y += xenbus/
+obj-$(CONFIG_XEN) += char/
+
+obj-$(CONFIG_XEN) += util.o
+obj-$(CONFIG_XEN_XENCOMM) += $(xen-xencomm-y)
+obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y)
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
+obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
+obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
+obj-$(CONFIG_XEN_TPMDEV_BACKEND) += tpmback/
+obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += blkfront/
+obj-$(CONFIG_XEN_NETDEV_FRONTEND) += netfront/
+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback/
+obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront/
+obj-$(CONFIG_XEN_FRAMEBUFFER) += fbfront/
+obj-$(CONFIG_XEN_KEYBOARD) += fbfront/
+obj-$(CONFIG_XEN_PRIVCMD) += privcmd/
+obj-$(CONFIG_XEN_GRANT_DEV) += gntdev/
+obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_UTIL) += sfc_netutil/
+obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_FRONTEND) += sfc_netfront/
+obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_BACKEND) += sfc_netback/
--- /dev/null
+
+obj-y := balloon.o sysfs.o
--- /dev/null
+/******************************************************************************
+ * balloon.c
+ *
+ * Xen balloon driver - enables returning/claiming memory to/from Xen.
+ *
+ * Copyright (c) 2003, B Dragovic
+ * Copyright (c) 2003-2004, M Williamson, K Fraser
+ * Copyright (c) 2005 Dan M. Smith, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/smp_lock.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+#include <xen/xen_proc.h>
+#include <asm/hypervisor.h>
+#include <xen/balloon.h>
+#include <xen/interface/memory.h>
+#include <asm/maddr.h>
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+#include <linux/highmem.h>
+#include <linux/list.h>
+#include <xen/xenbus.h>
+#include "common.h"
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry *balloon_pde;
+#endif
+
+static DEFINE_MUTEX(balloon_mutex);
+
+/*
+ * Protects atomic reservation decrease/increase against concurrent increases.
+ * Also protects non-atomic updates of current_pages and driver_pages, and
+ * balloon lists.
+ */
+DEFINE_SPINLOCK(balloon_lock);
+
+struct balloon_stats balloon_stats;
+
+/* We increase/decrease in batches which fit in a page */
+static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
+
+/* VM /proc information for memory */
+extern unsigned long totalram_pages;
+
+#if !defined(MODULE) && defined(CONFIG_HIGHMEM)
+extern unsigned long totalhigh_pages;
+#define inc_totalhigh_pages() (totalhigh_pages++)
+#define dec_totalhigh_pages() (totalhigh_pages--)
+#else
+#define inc_totalhigh_pages() ((void)0)
+#define dec_totalhigh_pages() ((void)0)
+#endif
+extern unsigned long num_physpages;
+
+/* List of ballooned pages, threaded through the mem_map array. */
+static LIST_HEAD(ballooned_pages);
+
+/* Main work function, always executed in process context. */
+static void balloon_process(struct work_struct *unused);
+static DECLARE_WORK(balloon_worker, balloon_process);
+static struct timer_list balloon_timer;
+
+/* When ballooning out (allocating memory to return to Xen) we don't really
+ want the kernel to try too hard since that can trigger the oom killer. */
+#define GFP_BALLOON \
+ (GFP_HIGHUSER|__GFP_NOWARN|__GFP_NORETRY|__GFP_NOMEMALLOC|__GFP_COLD)
+
+#define PAGE_TO_LIST(p) (&(p)->lru)
+#define LIST_TO_PAGE(l) list_entry((l), struct page, lru)
+#define UNLIST_PAGE(p) \
+ do { \
+ list_del(PAGE_TO_LIST(p)); \
+ PAGE_TO_LIST(p)->next = NULL; \
+ PAGE_TO_LIST(p)->prev = NULL; \
+ } while(0)
+
+#define IPRINTK(fmt, args...) \
+ printk(KERN_INFO "xen_mem: " fmt, ##args)
+#define WPRINTK(fmt, args...) \
+ printk(KERN_WARNING "xen_mem: " fmt, ##args)
+
+/* balloon_append: add the given page to the balloon. */
+static void balloon_append(struct page *page)
+{
+ /* Lowmem is re-populated first, so highmem pages go at list tail. */
+ if (PageHighMem(page)) {
+ list_add_tail(PAGE_TO_LIST(page), &ballooned_pages);
+ bs.balloon_high++;
+ dec_totalhigh_pages();
+ } else {
+ list_add(PAGE_TO_LIST(page), &ballooned_pages);
+ bs.balloon_low++;
+ }
+}
+
+/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
+static struct page *balloon_retrieve(void)
+{
+ struct page *page;
+
+ if (list_empty(&ballooned_pages))
+ return NULL;
+
+ page = LIST_TO_PAGE(ballooned_pages.next);
+ UNLIST_PAGE(page);
+
+ if (PageHighMem(page)) {
+ bs.balloon_high--;
+ inc_totalhigh_pages();
+ }
+ else
+ bs.balloon_low--;
+
+ return page;
+}
+
+static struct page *balloon_first_page(void)
+{
+ if (list_empty(&ballooned_pages))
+ return NULL;
+ return LIST_TO_PAGE(ballooned_pages.next);
+}
+
+static struct page *balloon_next_page(struct page *page)
+{
+ struct list_head *next = PAGE_TO_LIST(page)->next;
+ if (next == &ballooned_pages)
+ return NULL;
+ return LIST_TO_PAGE(next);
+}
+
+static inline void balloon_free_page(struct page *page)
+{
+#ifndef MODULE
+ if (put_page_testzero(page))
+ free_cold_page(page);
+#else
+ /* free_cold_page() is not being exported. */
+ __free_page(page);
+#endif
+}
+
+static void balloon_alarm(unsigned long unused)
+{
+ schedule_work(&balloon_worker);
+}
+
+static unsigned long current_target(void)
+{
+ unsigned long target = min(bs.target_pages, bs.hard_limit);
+ if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high))
+ target = bs.current_pages + bs.balloon_low + bs.balloon_high;
+ return target;
+}
+
+static unsigned long minimum_target(void)
+{
+#ifndef CONFIG_XEN
+#define max_pfn num_physpages
+#endif
+ unsigned long min_pages, curr_pages = current_target();
+
+#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
+ /* Simple continuous piecewiese linear function:
+ * max MiB -> min MiB gradient
+ * 0 0
+ * 16 16
+ * 32 24
+ * 128 72 (1/2)
+ * 512 168 (1/4)
+ * 2048 360 (1/8)
+ * 8192 552 (1/32)
+ * 32768 1320
+ * 131072 4392
+ */
+ if (max_pfn < MB2PAGES(128))
+ min_pages = MB2PAGES(8) + (max_pfn >> 1);
+ else if (max_pfn < MB2PAGES(512))
+ min_pages = MB2PAGES(40) + (max_pfn >> 2);
+ else if (max_pfn < MB2PAGES(2048))
+ min_pages = MB2PAGES(104) + (max_pfn >> 3);
+ else
+ min_pages = MB2PAGES(296) + (max_pfn >> 5);
+#undef MB2PAGES
+
+ /* Don't enforce growth */
+ return min(min_pages, curr_pages);
+#undef max_pfn
+}
+
+static int increase_reservation(unsigned long nr_pages)
+{
+ unsigned long pfn, i, flags;
+ struct page *page;
+ long rc;
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+ if (nr_pages > ARRAY_SIZE(frame_list))
+ nr_pages = ARRAY_SIZE(frame_list);
+
+ balloon_lock(flags);
+
+ page = balloon_first_page();
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(page == NULL);
+ frame_list[i] = page_to_pfn(page);;
+ page = balloon_next_page(page);
+ }
+
+ set_xen_guest_handle(reservation.extent_start, frame_list);
+ reservation.nr_extents = nr_pages;
+ rc = HYPERVISOR_memory_op(
+ XENMEM_populate_physmap, &reservation);
+ if (rc < nr_pages) {
+ if (rc > 0) {
+ int ret;
+
+ /* We hit the Xen hard limit: reprobe. */
+ reservation.nr_extents = rc;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ BUG_ON(ret != rc);
+ }
+ if (rc >= 0)
+ bs.hard_limit = (bs.current_pages + rc -
+ bs.driver_pages);
+ goto out;
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ page = balloon_retrieve();
+ BUG_ON(page == NULL);
+
+ pfn = page_to_pfn(page);
+ BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
+ phys_to_machine_mapping_valid(pfn));
+
+ set_phys_to_machine(pfn, frame_list[i]);
+
+#ifdef CONFIG_XEN
+ /* Link back into the page tables if not highmem. */
+ if (pfn < max_low_pfn) {
+ int ret;
+ ret = HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ pfn_pte_ma(frame_list[i], PAGE_KERNEL),
+ 0);
+ BUG_ON(ret);
+ }
+#endif
+
+ /* Relinquish the page back to the allocator. */
+ ClearPageReserved(page);
+ init_page_count(page);
+ balloon_free_page(page);
+ }
+
+ bs.current_pages += nr_pages;
+ totalram_pages = bs.current_pages;
+
+ out:
+ balloon_unlock(flags);
+
+ return 0;
+}
+
+static int decrease_reservation(unsigned long nr_pages)
+{
+ unsigned long pfn, i, flags;
+ struct page *page;
+ void *v;
+ int need_sleep = 0;
+ int ret;
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+ if (nr_pages > ARRAY_SIZE(frame_list))
+ nr_pages = ARRAY_SIZE(frame_list);
+
+ for (i = 0; i < nr_pages; i++) {
+ if ((page = alloc_page(GFP_BALLOON)) == NULL) {
+ nr_pages = i;
+ need_sleep = 1;
+ break;
+ }
+
+ pfn = page_to_pfn(page);
+ frame_list[i] = pfn_to_mfn(pfn);
+
+ if (!PageHighMem(page)) {
+ v = phys_to_virt(pfn << PAGE_SHIFT);
+ scrub_pages(v, 1);
+#ifdef CONFIG_XEN
+ ret = HYPERVISOR_update_va_mapping(
+ (unsigned long)v, __pte_ma(0), 0);
+ BUG_ON(ret);
+#endif
+ }
+#ifdef CONFIG_XEN_SCRUB_PAGES
+ else {
+ v = kmap(page);
+ scrub_pages(v, 1);
+ kunmap(page);
+ }
+#endif
+ }
+
+#ifdef CONFIG_XEN
+ /* Ensure that ballooned highmem pages don't have kmaps. */
+ kmap_flush_unused();
+ flush_tlb_all();
+#endif
+
+ balloon_lock(flags);
+
+ /* No more mappings: invalidate P2M and add to balloon. */
+ for (i = 0; i < nr_pages; i++) {
+ pfn = mfn_to_pfn(frame_list[i]);
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ balloon_append(pfn_to_page(pfn));
+ }
+
+ set_xen_guest_handle(reservation.extent_start, frame_list);
+ reservation.nr_extents = nr_pages;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+ BUG_ON(ret != nr_pages);
+
+ bs.current_pages -= nr_pages;
+ totalram_pages = bs.current_pages;
+
+ balloon_unlock(flags);
+
+ return need_sleep;
+}
+
+/*
+ * We avoid multiple worker processes conflicting via the balloon mutex.
+ * We may of course race updates of the target counts (which are protected
+ * by the balloon lock), or with changes to the Xen hard limit, but we will
+ * recover from these in time.
+ */
+static void balloon_process(struct work_struct *unused)
+{
+ int need_sleep = 0;
+ long credit;
+
+ mutex_lock(&balloon_mutex);
+
+ do {
+ credit = current_target() - bs.current_pages;
+ if (credit > 0)
+ need_sleep = (increase_reservation(credit) != 0);
+ if (credit < 0)
+ need_sleep = (decrease_reservation(-credit) != 0);
+
+#ifndef CONFIG_PREEMPT
+ if (need_resched())
+ schedule();
+#endif
+ } while ((credit != 0) && !need_sleep);
+
+ /* Schedule more work if there is some still to be done. */
+ if (current_target() != bs.current_pages)
+ mod_timer(&balloon_timer, jiffies + HZ);
+
+ mutex_unlock(&balloon_mutex);
+}
+
+/* Resets the Xen limit, sets new target, and kicks off processing. */
+void balloon_set_new_target(unsigned long target)
+{
+ /* No need for lock. Not read-modify-write updates. */
+ bs.hard_limit = ~0UL;
+ bs.target_pages = max(target, minimum_target());
+ schedule_work(&balloon_worker);
+}
+
+static struct xenbus_watch target_watch =
+{
+ .node = "memory/target"
+};
+
+/* React to a change in the target key */
+static void watch_target(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ unsigned long long new_target;
+ int err;
+
+ err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
+ if (err != 1) {
+ /* This is ok (for domain0 at least) - so just return */
+ return;
+ }
+
+ /* The given memory/target value is in KiB, so it needs converting to
+ * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
+ */
+ balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
+}
+
+static int balloon_init_watcher(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ int err;
+
+ err = register_xenbus_watch(&target_watch);
+ if (err)
+ printk(KERN_ERR "Failed to set balloon watcher\n");
+
+ return NOTIFY_DONE;
+}
+
+#ifdef CONFIG_PROC_FS
+static int balloon_write(struct file *file, const char __user *buffer,
+ unsigned long count, void *data)
+{
+ char memstring[64], *endchar;
+ unsigned long long target_bytes;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (count <= 1)
+ return -EBADMSG; /* runt */
+ if (count > sizeof(memstring))
+ return -EFBIG; /* too long */
+
+ if (copy_from_user(memstring, buffer, count))
+ return -EFAULT;
+ memstring[sizeof(memstring)-1] = '\0';
+
+ target_bytes = memparse(memstring, &endchar);
+ balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+
+ return count;
+}
+
+static int balloon_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ int len;
+
+ len = sprintf(
+ page,
+ "Current allocation: %8lu kB\n"
+ "Requested target: %8lu kB\n"
+ "Minimum target: %8lu kB\n"
+ "Maximum target: %8lu kB\n"
+ "Low-mem balloon: %8lu kB\n"
+ "High-mem balloon: %8lu kB\n"
+ "Driver pages: %8lu kB\n"
+ "Xen hard limit: ",
+ PAGES2KB(bs.current_pages), PAGES2KB(bs.target_pages),
+ PAGES2KB(minimum_target()), PAGES2KB(num_physpages),
+ PAGES2KB(bs.balloon_low), PAGES2KB(bs.balloon_high),
+ PAGES2KB(bs.driver_pages));
+
+ if (bs.hard_limit != ~0UL)
+ len += sprintf(page + len, "%8lu kB\n",
+ PAGES2KB(bs.hard_limit));
+ else
+ len += sprintf(page + len, " ??? kB\n");
+
+ *eof = 1;
+ return len;
+}
+#endif
+
+static struct notifier_block xenstore_notifier;
+
+static int __init balloon_init(void)
+{
+#if defined(CONFIG_X86) && defined(CONFIG_XEN)
+ unsigned long pfn;
+ struct page *page;
+#endif
+
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ IPRINTK("Initialising balloon driver.\n");
+
+#ifdef CONFIG_XEN
+ bs.current_pages = min(xen_start_info->nr_pages, max_pfn);
+ totalram_pages = bs.current_pages;
+#else
+ bs.current_pages = totalram_pages;
+#endif
+ bs.target_pages = bs.current_pages;
+ bs.balloon_low = 0;
+ bs.balloon_high = 0;
+ bs.driver_pages = 0UL;
+ bs.hard_limit = ~0UL;
+
+ init_timer(&balloon_timer);
+ balloon_timer.data = 0;
+ balloon_timer.function = balloon_alarm;
+
+#ifdef CONFIG_PROC_FS
+ if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) {
+ WPRINTK("Unable to create /proc/xen/balloon.\n");
+ return -1;
+ }
+
+ balloon_pde->read_proc = balloon_read;
+ balloon_pde->write_proc = balloon_write;
+#endif
+ balloon_sysfs_init();
+
+#if defined(CONFIG_X86) && defined(CONFIG_XEN)
+ /* Initialise the balloon with excess memory space. */
+ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
+ page = pfn_to_page(pfn);
+ if (!PageReserved(page))
+ balloon_append(page);
+ }
+#endif
+
+ target_watch.callback = watch_target;
+ xenstore_notifier.notifier_call = balloon_init_watcher;
+
+ register_xenstore_notifier(&xenstore_notifier);
+
+ return 0;
+}
+
+subsys_initcall(balloon_init);
+
+static void __exit balloon_exit(void)
+{
+ /* XXX - release balloon here */
+ return;
+}
+
+module_exit(balloon_exit);
+
+void balloon_update_driver_allowance(long delta)
+{
+ unsigned long flags;
+
+ balloon_lock(flags);
+ bs.driver_pages += delta;
+ balloon_unlock(flags);
+}
+
+#ifdef CONFIG_XEN
+static int dealloc_pte_fn(
+ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+ unsigned long mfn = pte_mfn(*pte);
+ int ret;
+ struct xen_memory_reservation reservation = {
+ .nr_extents = 1,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ set_xen_guest_handle(reservation.extent_start, &mfn);
+ set_pte_at(&init_mm, addr, pte, __pte_ma(0));
+ set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+ BUG_ON(ret != 1);
+ return 0;
+}
+#endif
+
+struct page **alloc_empty_pages_and_pagevec(int nr_pages)
+{
+ unsigned long flags;
+ void *v;
+ struct page *page, **pagevec;
+ int i, ret;
+
+ pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL);
+ if (pagevec == NULL)
+ return NULL;
+
+ for (i = 0; i < nr_pages; i++) {
+ page = pagevec[i] = alloc_page(GFP_KERNEL|__GFP_COLD);
+ if (page == NULL)
+ goto err;
+
+ v = page_address(page);
+ scrub_pages(v, 1);
+
+ balloon_lock(flags);
+
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+ unsigned long gmfn = page_to_pfn(page);
+ struct xen_memory_reservation reservation = {
+ .nr_extents = 1,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ set_xen_guest_handle(reservation.extent_start, &gmfn);
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ if (ret == 1)
+ ret = 0; /* success */
+ } else {
+#ifdef CONFIG_XEN
+ ret = apply_to_page_range(&init_mm, (unsigned long)v,
+ PAGE_SIZE, dealloc_pte_fn,
+ NULL);
+#else
+ /* Cannot handle non-auto translate mode. */
+ ret = 1;
+#endif
+ }
+
+ if (ret != 0) {
+ balloon_unlock(flags);
+ balloon_free_page(page);
+ goto err;
+ }
+
+ totalram_pages = --bs.current_pages;
+
+ balloon_unlock(flags);
+ }
+
+ out:
+ schedule_work(&balloon_worker);
+#ifdef CONFIG_XEN
+ flush_tlb_all();
+#endif
+ return pagevec;
+
+ err:
+ balloon_lock(flags);
+ while (--i >= 0)
+ balloon_append(pagevec[i]);
+ balloon_unlock(flags);
+ kfree(pagevec);
+ pagevec = NULL;
+ goto out;
+}
+
+static void _free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages, int free_vec)
+{
+ unsigned long flags;
+ int i;
+
+ if (pagevec == NULL)
+ return;
+
+ balloon_lock(flags);
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(page_count(pagevec[i]) != 1);
+ balloon_append(pagevec[i]);
+ }
+ balloon_unlock(flags);
+
+ if (free_vec)
+ kfree(pagevec);
+ else
+ totalram_pages = bs.current_pages -= nr_pages;
+
+ schedule_work(&balloon_worker);
+}
+
+void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
+{
+ _free_empty_pages_and_pagevec(pagevec, nr_pages, 1);
+}
+
+void free_empty_pages(struct page **pagevec, int nr_pages)
+{
+ _free_empty_pages_and_pagevec(pagevec, nr_pages, 0);
+}
+
+void balloon_release_driver_page(struct page *page)
+{
+ unsigned long flags;
+
+ balloon_lock(flags);
+ balloon_append(page);
+ bs.driver_pages--;
+ balloon_unlock(flags);
+
+ schedule_work(&balloon_worker);
+}
+
+EXPORT_SYMBOL_GPL(balloon_update_driver_allowance);
+EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
+EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
+EXPORT_SYMBOL_GPL(balloon_release_driver_page);
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null
+/******************************************************************************
+ * balloon/common.h
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_BALLOON_COMMON_H__
+#define __XEN_BALLOON_COMMON_H__
+
+#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+
+struct balloon_stats {
+ /* We aim for 'current allocation' == 'target allocation'. */
+ unsigned long current_pages;
+ unsigned long target_pages;
+ /* We may hit the hard limit in Xen. If we do then we remember it. */
+ unsigned long hard_limit;
+ /*
+ * Drivers may alter the memory reservation independently, but they
+ * must inform the balloon driver so we avoid hitting the hard limit.
+ */
+ unsigned long driver_pages;
+ /* Number of pages in high- and low-memory balloons. */
+ unsigned long balloon_low;
+ unsigned long balloon_high;
+};
+
+extern struct balloon_stats balloon_stats;
+#define bs balloon_stats
+
+int balloon_sysfs_init(void);
+void balloon_sysfs_exit(void);
+
+void balloon_set_new_target(unsigned long target);
+
+#endif /* __XEN_BALLOON_COMMON_H__ */
--- /dev/null
+/******************************************************************************
+ * balloon/sysfs.c
+ *
+ * Xen balloon driver - sysfs interfaces.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/sysdev.h>
+#include <linux/module.h>
+#include "common.h"
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#define BALLOON_CLASS_NAME "xen_memory"
+
+#define BALLOON_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct sys_device *dev, \
+ char *buf) \
+ { \
+ return sprintf(buf, format, ##args); \
+ } \
+ static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
+
+BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(bs.current_pages));
+BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(bs.balloon_low));
+BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high));
+BALLOON_SHOW(hard_limit_kb,
+ (bs.hard_limit!=~0UL) ? "%lu\n" : "???\n",
+ (bs.hard_limit!=~0UL) ? PAGES2KB(bs.hard_limit) : 0);
+BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
+
+static ssize_t show_target_kb(struct sys_device *dev, char *buf)
+{
+ return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
+}
+
+static ssize_t store_target_kb(struct sys_device *dev,
+ const char *buf,
+ size_t count)
+{
+ char memstring[64], *endchar;
+ unsigned long long target_bytes;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (count <= 1)
+ return -EBADMSG; /* runt */
+ if (count > sizeof(memstring))
+ return -EFBIG; /* too long */
+ strcpy(memstring, buf);
+
+ target_bytes = memparse(memstring, &endchar);
+ balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+
+ return count;
+}
+
+static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR,
+ show_target_kb, store_target_kb);
+
+static struct sysdev_attribute *balloon_attrs[] = {
+ &attr_target_kb,
+};
+
+static struct attribute *balloon_info_attrs[] = {
+ &attr_current_kb.attr,
+ &attr_low_kb.attr,
+ &attr_high_kb.attr,
+ &attr_hard_limit_kb.attr,
+ &attr_driver_kb.attr,
+ NULL
+};
+
+static struct attribute_group balloon_info_group = {
+ .name = "info",
+ .attrs = balloon_info_attrs,
+};
+
+static struct sysdev_class balloon_sysdev_class = {
+ .name = BALLOON_CLASS_NAME,
+};
+
+static struct sys_device balloon_sysdev;
+
+static int register_balloon(struct sys_device *sysdev)
+{
+ int i, error;
+
+ error = sysdev_class_register(&balloon_sysdev_class);
+ if (error)
+ return error;
+
+ sysdev->id = 0;
+ sysdev->cls = &balloon_sysdev_class;
+
+ error = sysdev_register(sysdev);
+ if (error) {
+ sysdev_class_unregister(&balloon_sysdev_class);
+ return error;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) {
+ error = sysdev_create_file(sysdev, balloon_attrs[i]);
+ if (error)
+ goto fail;
+ }
+
+ error = sysfs_create_group(&sysdev->kobj, &balloon_info_group);
+ if (error)
+ goto fail;
+
+ return 0;
+
+ fail:
+ while (--i >= 0)
+ sysdev_remove_file(sysdev, balloon_attrs[i]);
+ sysdev_unregister(sysdev);
+ sysdev_class_unregister(&balloon_sysdev_class);
+ return error;
+}
+
+static void unregister_balloon(struct sys_device *sysdev)
+{
+ int i;
+
+ sysfs_remove_group(&sysdev->kobj, &balloon_info_group);
+ for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++)
+ sysdev_remove_file(sysdev, balloon_attrs[i]);
+ sysdev_unregister(sysdev);
+ sysdev_class_unregister(&balloon_sysdev_class);
+}
+
+int balloon_sysfs_init(void)
+{
+ return register_balloon(&balloon_sysdev);
+}
+
+void balloon_sysfs_exit(void)
+{
+ unregister_balloon(&balloon_sysdev);
+}
--- /dev/null
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o
+
+blkbk-y := blkback.o xenbus.o interface.o vbd.o cdrom.o
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/main.c
+ *
+ * Back-end of the driver for virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ * arch/xen/drivers/blkif/frontend
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Copyright (c) 2005, Christopher Clark
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <xen/balloon.h>
+#include <asm/hypervisor.h>
+#include "common.h"
+
+/*
+ * These are rather arbitrary. They are fairly large because adjacent requests
+ * pulled from a communication ring are quite likely to end up being part of
+ * the same scatter/gather request at the disc.
+ *
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
+ *
+ * This will increase the chances of being able to write whole tracks.
+ * 64 should be enough to keep us competitive with Linux.
+ */
+static int blkif_reqs = 64;
+module_param_named(reqs, blkif_reqs, int, 0);
+MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
+
+/* Run-time switchable: /sys/module/blkback/parameters/ */
+static unsigned int log_stats = 0;
+static unsigned int debug_lvl = 0;
+module_param(log_stats, int, 0644);
+module_param(debug_lvl, int, 0644);
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+ blkif_t *blkif;
+ u64 id;
+ int nr_pages;
+ atomic_t pendcnt;
+ unsigned short operation;
+ int status;
+ struct list_head free_list;
+} pending_req_t;
+
+static pending_req_t *pending_reqs;
+static struct list_head pending_free;
+static DEFINE_SPINLOCK(pending_free_lock);
+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+static struct page **pending_pages;
+static grant_handle_t *pending_grant_handles;
+
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+ return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+ unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]);
+ return (unsigned long)pfn_to_kaddr(pfn);
+}
+
+#define pending_handle(_req, _seg) \
+ (pending_grant_handles[vaddr_pagenr(_req, _seg)])
+
+
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+ blkif_request_t *req,
+ pending_req_t *pending_req);
+static void make_response(blkif_t *blkif, u64 id,
+ unsigned short op, int st);
+
+/******************************************************************
+ * misc small helpers
+ */
+static pending_req_t* alloc_req(void)
+{
+ pending_req_t *req = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pending_free_lock, flags);
+ if (!list_empty(&pending_free)) {
+ req = list_entry(pending_free.next, pending_req_t, free_list);
+ list_del(&req->free_list);
+ }
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+ return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+ unsigned long flags;
+ int was_empty;
+
+ spin_lock_irqsave(&pending_free_lock, flags);
+ was_empty = list_empty(&pending_free);
+ list_add(&req->free_list, &pending_free);
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+ if (was_empty)
+ wake_up(&pending_free_wq);
+}
+
+static void unplug_queue(blkif_t *blkif)
+{
+ if (blkif->plug == NULL)
+ return;
+ if (blkif->plug->unplug_fn)
+ blkif->plug->unplug_fn(blkif->plug);
+ kobject_get(&blkif->plug->kobj);
+ blkif->plug = NULL;
+}
+
+static void plug_queue(blkif_t *blkif, struct bio *bio)
+{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+ if (q == blkif->plug)
+ return;
+ unplug_queue(blkif);
+ WARN_ON(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags));
+ kobject_get(&q->kobj);
+ blkif->plug = q;
+}
+
+static void fast_flush_area(pending_req_t *req)
+{
+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ unsigned int i, invcount = 0;
+ grant_handle_t handle;
+ int ret;
+
+ for (i = 0; i < req->nr_pages; i++) {
+ handle = pending_handle(req, i);
+ if (handle == BLKBACK_INVALID_HANDLE)
+ continue;
+ gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
+ GNTMAP_host_map, handle);
+ pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
+ invcount++;
+ }
+
+ ret = HYPERVISOR_grant_table_op(
+ GNTTABOP_unmap_grant_ref, unmap, invcount);
+ BUG_ON(ret);
+}
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+static void print_stats(blkif_t *blkif)
+{
+ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n",
+ current->comm, blkif->st_oo_req,
+ blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
+ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+ blkif->st_rd_req = 0;
+ blkif->st_wr_req = 0;
+ blkif->st_oo_req = 0;
+}
+
+int blkif_schedule(void *arg)
+{
+ blkif_t *blkif = arg;
+
+ blkif_get(blkif);
+
+ if (debug_lvl)
+ printk(KERN_DEBUG "%s: started\n", current->comm);
+
+ while (!kthread_should_stop()) {
+ if (try_to_freeze())
+ continue;
+
+ wait_event_interruptible(
+ blkif->wq,
+ blkif->waiting_reqs || kthread_should_stop());
+ wait_event_interruptible(
+ pending_free_wq,
+ !list_empty(&pending_free) || kthread_should_stop());
+
+ blkif->waiting_reqs = 0;
+ smp_mb(); /* clear flag *before* checking for work */
+
+ if (do_block_io_op(blkif))
+ blkif->waiting_reqs = 1;
+ unplug_queue(blkif);
+
+ if (log_stats && time_after(jiffies, blkif->st_print))
+ print_stats(blkif);
+ }
+
+ if (log_stats)
+ print_stats(blkif);
+ if (debug_lvl)
+ printk(KERN_DEBUG "%s: exiting\n", current->comm);
+
+ blkif->xenblkd = NULL;
+ blkif_put(blkif);
+
+ return 0;
+}
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called as bh->b_end_io()
+ */
+
+static void __end_block_io_op(pending_req_t *pending_req, int error)
+{
+ /* An error fails the entire request. */
+ if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
+ (error == -EOPNOTSUPP)) {
+ DPRINTK("blkback: write barrier op failed, not supported\n");
+ blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
+ pending_req->status = BLKIF_RSP_EOPNOTSUPP;
+ } else if (error) {
+ DPRINTK("Buffer not up-to-date at end of operation, "
+ "error=%d\n", error);
+ pending_req->status = BLKIF_RSP_ERROR;
+ }
+
+ if (atomic_dec_and_test(&pending_req->pendcnt)) {
+ fast_flush_area(pending_req);
+ make_response(pending_req->blkif, pending_req->id,
+ pending_req->operation, pending_req->status);
+ blkif_put(pending_req->blkif);
+ free_req(pending_req);
+ }
+}
+
+static void end_block_io_op(struct bio *bio, int error)
+{
+ __end_block_io_op(bio->bi_private, error);
+ bio_put(bio);
+}
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+static void blkif_notify_work(blkif_t *blkif)
+{
+ blkif->waiting_reqs = 1;
+ wake_up(&blkif->wq);
+}
+
+irqreturn_t blkif_be_int(int irq, void *dev_id)
+{
+ blkif_notify_work(dev_id);
+ return IRQ_HANDLED;
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+
+static int do_block_io_op(blkif_t *blkif)
+{
+ blkif_back_rings_t *blk_rings = &blkif->blk_rings;
+ blkif_request_t req;
+ pending_req_t *pending_req;
+ RING_IDX rc, rp;
+ int more_to_do = 0;
+
+ rc = blk_rings->common.req_cons;
+ rp = blk_rings->common.sring->req_prod;
+ rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+ while (rc != rp) {
+
+ if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
+ break;
+
+ pending_req = alloc_req();
+ if (NULL == pending_req) {
+ blkif->st_oo_req++;
+ more_to_do = 1;
+ break;
+ }
+
+ if (kthread_should_stop()) {
+ more_to_do = 1;
+ break;
+ }
+
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
+ break;
+ case BLKIF_PROTOCOL_X86_64:
+ blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
+ break;
+ default:
+ BUG();
+ }
+ blk_rings->common.req_cons = ++rc; /* before make_response() */
+
+ switch (req.operation) {
+ case BLKIF_OP_READ:
+ blkif->st_rd_req++;
+ dispatch_rw_block_io(blkif, &req, pending_req);
+ break;
+ case BLKIF_OP_WRITE_BARRIER:
+ blkif->st_br_req++;
+ /* fall through */
+ case BLKIF_OP_WRITE:
+ blkif->st_wr_req++;
+ dispatch_rw_block_io(blkif, &req, pending_req);
+ break;
+ default:
+ /* A good sign something is wrong: sleep for a while to
+ * avoid excessive CPU consumption by a bad guest. */
+ msleep(1);
+ DPRINTK("error: unknown block io operation [%d]\n",
+ req.operation);
+ make_response(blkif, req.id, req.operation,
+ BLKIF_RSP_ERROR);
+ free_req(pending_req);
+ break;
+ }
+
+ /* Yield point for this unbounded loop. */
+ cond_resched();
+ }
+
+ return more_to_do;
+}
+
+static void dispatch_rw_block_io(blkif_t *blkif,
+ blkif_request_t *req,
+ pending_req_t *pending_req)
+{
+ extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
+ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct phys_req preq;
+ struct {
+ unsigned long buf; unsigned int nsec;
+ } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ unsigned int nseg;
+ struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ int ret, i, nbio = 0;
+ int operation;
+
+ switch (req->operation) {
+ case BLKIF_OP_READ:
+ operation = READ;
+ break;
+ case BLKIF_OP_WRITE:
+ operation = WRITE;
+ break;
+ case BLKIF_OP_WRITE_BARRIER:
+ operation = WRITE_BARRIER;
+ break;
+ default:
+ operation = 0; /* make gcc happy */
+ BUG();
+ }
+
+ /* Check that number of segments is sane. */
+ nseg = req->nr_segments;
+ if (unlikely(nseg == 0 && operation != WRITE_BARRIER) ||
+ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+ DPRINTK("Bad number of segments in request (%d)\n", nseg);
+ goto fail_response;
+ }
+
+ preq.dev = req->handle;
+ preq.sector_number = req->sector_number;
+ preq.nr_sects = 0;
+
+ pending_req->blkif = blkif;
+ pending_req->id = req->id;
+ pending_req->operation = req->operation;
+ pending_req->status = BLKIF_RSP_OKAY;
+ pending_req->nr_pages = nseg;
+
+ for (i = 0; i < nseg; i++) {
+ uint32_t flags;
+
+ seg[i].nsec = req->seg[i].last_sect -
+ req->seg[i].first_sect + 1;
+
+ if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
+ (req->seg[i].last_sect < req->seg[i].first_sect))
+ goto fail_response;
+ preq.nr_sects += seg[i].nsec;
+
+ flags = GNTMAP_host_map;
+ if (operation != READ)
+ flags |= GNTMAP_readonly;
+ gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
+ req->seg[i].gref, blkif->domid);
+ }
+
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
+ BUG_ON(ret);
+
+ for (i = 0; i < nseg; i++) {
+ if (unlikely(map[i].status != 0)) {
+ DPRINTK("invalid buffer -- could not remap it\n");
+ map[i].handle = BLKBACK_INVALID_HANDLE;
+ ret |= 1;
+ }
+
+ pending_handle(pending_req, i) = map[i].handle;
+
+ if (ret)
+ continue;
+
+ set_phys_to_machine(__pa(vaddr(
+ pending_req, i)) >> PAGE_SHIFT,
+ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
+ seg[i].buf = map[i].dev_bus_addr |
+ (req->seg[i].first_sect << 9);
+ }
+
+ if (ret)
+ goto fail_flush;
+
+ if (vbd_translate(&preq, blkif, operation) != 0) {
+ DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
+ operation == READ ? "read" : "write",
+ preq.sector_number,
+ preq.sector_number + preq.nr_sects, preq.dev);
+ goto fail_flush;
+ }
+
+ for (i = 0; i < nseg; i++) {
+ if (((int)preq.sector_number|(int)seg[i].nsec) &
+ ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
+ DPRINTK("Misaligned I/O request from domain %d",
+ blkif->domid);
+ goto fail_put_bio;
+ }
+
+ while ((bio == NULL) ||
+ (bio_add_page(bio,
+ virt_to_page(vaddr(pending_req, i)),
+ seg[i].nsec << 9,
+ seg[i].buf & ~PAGE_MASK) == 0)) {
+ bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
+ if (unlikely(bio == NULL))
+ goto fail_put_bio;
+
+ bio->bi_bdev = preq.bdev;
+ bio->bi_private = pending_req;
+ bio->bi_end_io = end_block_io_op;
+ bio->bi_sector = preq.sector_number;
+ }
+
+ preq.sector_number += seg[i].nsec;
+ }
+
+ if (!bio) {
+ BUG_ON(operation != WRITE_BARRIER);
+ bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, 0);
+ if (unlikely(bio == NULL))
+ goto fail_put_bio;
+
+ bio->bi_bdev = preq.bdev;
+ bio->bi_private = pending_req;
+ bio->bi_end_io = end_block_io_op;
+ bio->bi_sector = -1;
+ }
+
+ plug_queue(blkif, bio);
+ atomic_set(&pending_req->pendcnt, nbio);
+ blkif_get(blkif);
+
+ for (i = 0; i < nbio; i++)
+ submit_bio(operation, biolist[i]);
+
+ if (operation == READ)
+ blkif->st_rd_sect += preq.nr_sects;
+ else if (operation == WRITE || operation == WRITE_BARRIER)
+ blkif->st_wr_sect += preq.nr_sects;
+
+ return;
+
+ fail_put_bio:
+ for (i = 0; i < (nbio-1); i++)
+ bio_put(biolist[i]);
+ fail_flush:
+ fast_flush_area(pending_req);
+ fail_response:
+ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+ free_req(pending_req);
+ msleep(1); /* back off a bit */
+}
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+static void make_response(blkif_t *blkif, u64 id,
+ unsigned short op, int st)
+{
+ blkif_response_t resp;
+ unsigned long flags;
+ blkif_back_rings_t *blk_rings = &blkif->blk_rings;
+ int more_to_do = 0;
+ int notify;
+
+ resp.id = id;
+ resp.operation = op;
+ resp.status = st;
+
+ spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+ /* Place on the response ring for the relevant domain. */
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ case BLKIF_PROTOCOL_X86_64:
+ memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ default:
+ BUG();
+ }
+ blk_rings->common.rsp_prod_pvt++;
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
+ if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
+ /*
+ * Tail check for pending requests. Allows frontend to avoid
+ * notifications if requests are already in flight (lower
+ * overheads and promotes batching).
+ */
+ RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
+
+ } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
+ more_to_do = 1;
+ }
+
+ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+
+ if (more_to_do)
+ blkif_notify_work(blkif);
+ if (notify)
+ notify_remote_via_irq(blkif->irq);
+}
+
+static int __init blkif_init(void)
+{
+ int i, mmap_pages;
+
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+ pending_reqs = kmalloc(sizeof(pending_reqs[0]) *
+ blkif_reqs, GFP_KERNEL);
+ pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
+ mmap_pages, GFP_KERNEL);
+ pending_pages = alloc_empty_pages_and_pagevec(mmap_pages);
+
+ if (!pending_reqs || !pending_grant_handles || !pending_pages)
+ goto out_of_memory;
+
+ for (i = 0; i < mmap_pages; i++)
+ pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
+
+ blkif_interface_init();
+
+ memset(pending_reqs, 0, sizeof(pending_reqs));
+ INIT_LIST_HEAD(&pending_free);
+
+ for (i = 0; i < blkif_reqs; i++)
+ list_add_tail(&pending_reqs[i].free_list, &pending_free);
+
+ blkif_xenbus_init();
+
+ return 0;
+
+ out_of_memory:
+ kfree(pending_reqs);
+ kfree(pending_grant_handles);
+ free_empty_pages_and_pagevec(pending_pages, mmap_pages);
+ printk("%s: out of memory\n", __FUNCTION__);
+ return -ENOMEM;
+}
+
+module_init(blkif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null
+/******************************************************************************
+ * blkback/cdrom.c
+ *
+ * Routines for managing cdrom watch and media-present attribute of a
+ * cdrom type virtual block device (VBD).
+ *
+ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
+ * Copyright (c) 2007 Pat Campbell
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#undef DPRINTK
+#define DPRINTK(_f, _a...) \
+ printk("(%s() file=%s, line=%d) " _f "\n", \
+ __PRETTY_FUNCTION__, __FILE__ , __LINE__ , ##_a )
+
+
+#define MEDIA_PRESENT "media-present"
+
+static void cdrom_media_changed(struct xenbus_watch *, const char **, unsigned int);
+
+/**
+ * Writes media-present=1 attribute for the given vbd device if not
+ * already there
+ */
+static int cdrom_xenstore_write_media_present(struct backend_info *be)
+{
+ struct xenbus_device *dev = be->dev;
+ struct xenbus_transaction xbt;
+ int err;
+ int media_present;
+
+ err = xenbus_scanf(XBT_NIL, dev->nodename, MEDIA_PRESENT, "%d",
+ &media_present);
+ if (0 < err) {
+ DPRINTK("already written err%d", err);
+ return(0);
+ }
+ media_present = 1;
+
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "starting transaction");
+ return(-1);
+ }
+
+ err = xenbus_printf(xbt, dev->nodename, MEDIA_PRESENT, "%d", media_present );
+ if (err) {
+ xenbus_dev_fatal(dev, err, "writing %s/%s",
+ dev->nodename, MEDIA_PRESENT);
+ goto abort;
+ }
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN)
+ goto again;
+ if (err)
+ xenbus_dev_fatal(dev, err, "ending transaction");
+ return 0;
+ abort:
+ xenbus_transaction_end(xbt, 1);
+ return -1;
+}
+
+/**
+ *
+ */
+static int cdrom_is_type(struct backend_info *be)
+{
+ DPRINTK("type:%x", be->blkif->vbd.type );
+ return (be->blkif->vbd.type & VDISK_CDROM)
+ && (be->blkif->vbd.type & GENHD_FL_REMOVABLE);
+}
+
+/**
+ *
+ */
+void cdrom_add_media_watch(struct backend_info *be)
+{
+ struct xenbus_device *dev = be->dev;
+ int err;
+
+ DPRINTK("nodename:%s", dev->nodename);
+ if (cdrom_is_type(be)) {
+ DPRINTK("is a cdrom");
+ if ( cdrom_xenstore_write_media_present(be) == 0 ) {
+ DPRINTK( "xenstore wrote OK");
+ err = xenbus_watch_path2(dev, dev->nodename, MEDIA_PRESENT,
+ &be->backend_cdrom_watch,
+ cdrom_media_changed);
+ if (err)
+ DPRINTK( "media_present watch add failed" );
+ }
+ }
+}
+
+/**
+ * Callback received when the "media_present" xenstore node is changed
+ */
+static void cdrom_media_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ int err;
+ unsigned media_present;
+ struct backend_info *be
+ = container_of(watch, struct backend_info, backend_cdrom_watch);
+ struct xenbus_device *dev = be->dev;
+
+ if (!cdrom_is_type(be)) {
+ DPRINTK("callback not for a cdrom" );
+ return;
+ }
+
+ err = xenbus_scanf(XBT_NIL, dev->nodename, MEDIA_PRESENT, "%d",
+ &media_present);
+ if (err == 0 || err == -ENOENT) {
+ DPRINTK("xenbus_read of cdrom media_present node error:%d",err);
+ return;
+ }
+
+ if (media_present == 0)
+ vbd_free(&be->blkif->vbd);
+ else {
+ char *p = strrchr(dev->otherend, '/') + 1;
+ long handle = simple_strtoul(p, NULL, 0);
+
+ if (!be->blkif->vbd.bdev) {
+ err = vbd_create(be->blkif, handle, be->major, be->minor,
+ !strchr(be->mode, 'w'), 1);
+ if (err) {
+ be->major = be->minor = 0;
+ xenbus_dev_fatal(dev, err, "creating vbd structure");
+ return;
+ }
+ }
+ }
+}
--- /dev/null
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __BLKIF__BACKEND__COMMON_H__
+#define __BLKIF__BACKEND__COMMON_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+#include <asm/setup.h>
+#include <asm/pgalloc.h>
+#include <xen/evtchn.h>
+#include <asm/hypervisor.h>
+#include <xen/blkif.h>
+#include <xen/gnttab.h>
+#include <xen/driver_util.h>
+#include <xen/xenbus.h>
+
+#define DPRINTK(_f, _a...) \
+ pr_debug("(file=%s, line=%d) " _f, \
+ __FILE__ , __LINE__ , ## _a )
+
+struct vbd {
+ blkif_vdev_t handle; /* what the domain refers to this vbd as */
+ unsigned char readonly; /* Non-zero -> read-only */
+ unsigned char type; /* VDISK_xxx */
+ u32 pdevice; /* phys device that this vbd maps to */
+ struct block_device *bdev;
+};
+
+struct backend_info;
+
+typedef struct blkif_st {
+ /* Unique identifier for this interface. */
+ domid_t domid;
+ unsigned int handle;
+ /* Physical parameters of the comms window. */
+ unsigned int irq;
+ /* Comms information. */
+ enum blkif_protocol blk_protocol;
+ blkif_back_rings_t blk_rings;
+ struct vm_struct *blk_ring_area;
+ /* The VBD attached to this interface. */
+ struct vbd vbd;
+ /* Back pointer to the backend_info. */
+ struct backend_info *be;
+ /* Private fields. */
+ spinlock_t blk_ring_lock;
+ atomic_t refcnt;
+
+ wait_queue_head_t wq;
+ struct task_struct *xenblkd;
+ unsigned int waiting_reqs;
+ struct request_queue *plug;
+
+ /* statistics */
+ unsigned long st_print;
+ int st_rd_req;
+ int st_wr_req;
+ int st_oo_req;
+ int st_br_req;
+ int st_rd_sect;
+ int st_wr_sect;
+
+ wait_queue_head_t waiting_to_free;
+
+ grant_handle_t shmem_handle;
+ grant_ref_t shmem_ref;
+} blkif_t;
+
+struct backend_info
+{
+ struct xenbus_device *dev;
+ blkif_t *blkif;
+ struct xenbus_watch backend_watch;
+ struct xenbus_watch backend_cdrom_watch;
+ unsigned major;
+ unsigned minor;
+ char *mode;
+};
+
+blkif_t *blkif_alloc(domid_t domid);
+void blkif_disconnect(blkif_t *blkif);
+void blkif_free(blkif_t *blkif);
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
+
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b) \
+ do { \
+ if (atomic_dec_and_test(&(_b)->refcnt)) \
+ wake_up(&(_b)->waiting_to_free);\
+ } while (0)
+
+/* Create a vbd. */
+int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
+ unsigned minor, int readonly, int cdrom);
+void vbd_free(struct vbd *vbd);
+
+unsigned long long vbd_size(struct vbd *vbd);
+unsigned int vbd_info(struct vbd *vbd);
+unsigned long vbd_secsize(struct vbd *vbd);
+
+struct phys_req {
+ unsigned short dev;
+ unsigned short nr_sects;
+ struct block_device *bdev;
+ blkif_sector_t sector_number;
+};
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
+
+void blkif_interface_init(void);
+
+void blkif_xenbus_init(void);
+
+irqreturn_t blkif_be_int(int irq, void *dev_id);
+int blkif_schedule(void *arg);
+
+int blkback_barrier(struct xenbus_transaction xbt,
+ struct backend_info *be, int state);
+
+/* cdrom media change */
+void cdrom_add_media_watch(struct backend_info *be);
+
+#endif /* __BLKIF__BACKEND__COMMON_H__ */
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/interface.c
+ *
+ * Block-device interface management.
+ *
+ * Copyright (c) 2004, Keir Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+#include <xen/evtchn.h>
+#include <linux/kthread.h>
+
+static struct kmem_cache *blkif_cachep;
+
+blkif_t *blkif_alloc(domid_t domid)
+{
+ blkif_t *blkif;
+
+ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
+ if (!blkif)
+ return ERR_PTR(-ENOMEM);
+
+ memset(blkif, 0, sizeof(*blkif));
+ blkif->domid = domid;
+ spin_lock_init(&blkif->blk_ring_lock);
+ atomic_set(&blkif->refcnt, 1);
+ init_waitqueue_head(&blkif->wq);
+ blkif->st_print = jiffies;
+ init_waitqueue_head(&blkif->waiting_to_free);
+
+ return blkif;
+}
+
+static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
+{
+ struct gnttab_map_grant_ref op;
+
+ gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+ GNTMAP_host_map, shared_page, blkif->domid);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status) {
+ DPRINTK(" Grant table operation failure !\n");
+ return op.status;
+ }
+
+ blkif->shmem_ref = shared_page;
+ blkif->shmem_handle = op.handle;
+
+ return 0;
+}
+
+static void unmap_frontend_page(blkif_t *blkif)
+{
+ struct gnttab_unmap_grant_ref op;
+
+ gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+ GNTMAP_host_map, blkif->shmem_handle);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+}
+
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
+{
+ int err;
+
+ /* Already connected through? */
+ if (blkif->irq)
+ return 0;
+
+ if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
+ return -ENOMEM;
+
+ err = map_frontend_page(blkif, shared_page);
+ if (err) {
+ free_vm_area(blkif->blk_ring_area);
+ return err;
+ }
+
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ {
+ blkif_sring_t *sring;
+ sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
+ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_32:
+ {
+ blkif_x86_32_sring_t *sring_x86_32;
+ sring_x86_32 = (blkif_x86_32_sring_t *)blkif->blk_ring_area->addr;
+ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_64:
+ {
+ blkif_x86_64_sring_t *sring_x86_64;
+ sring_x86_64 = (blkif_x86_64_sring_t *)blkif->blk_ring_area->addr;
+ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+ break;
+ }
+ default:
+ BUG();
+ }
+
+ err = bind_interdomain_evtchn_to_irqhandler(
+ blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif);
+ if (err < 0)
+ {
+ unmap_frontend_page(blkif);
+ free_vm_area(blkif->blk_ring_area);
+ blkif->blk_rings.common.sring = NULL;
+ return err;
+ }
+ blkif->irq = err;
+
+ return 0;
+}
+
+void blkif_disconnect(blkif_t *blkif)
+{
+ if (blkif->xenblkd) {
+ kthread_stop(blkif->xenblkd);
+ blkif->xenblkd = NULL;
+ }
+
+ atomic_dec(&blkif->refcnt);
+ wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
+ atomic_inc(&blkif->refcnt);
+
+ if (blkif->irq) {
+ unbind_from_irqhandler(blkif->irq, blkif);
+ blkif->irq = 0;
+ }
+
+ if (blkif->blk_rings.common.sring) {
+ unmap_frontend_page(blkif);
+ free_vm_area(blkif->blk_ring_area);
+ blkif->blk_rings.common.sring = NULL;
+ }
+}
+
+void blkif_free(blkif_t *blkif)
+{
+ if (!atomic_dec_and_test(&blkif->refcnt))
+ BUG();
+ kmem_cache_free(blkif_cachep, blkif);
+}
+
+void __init blkif_interface_init(void)
+{
+ blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
+ 0, 0, NULL);
+}
--- /dev/null
+/******************************************************************************
+ * blkback/vbd.c
+ *
+ * Routines for managing virtual block devices (VBDs).
+ *
+ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
+ (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity)
+
+unsigned long long vbd_size(struct vbd *vbd)
+{
+ return vbd_sz(vbd);
+}
+
+unsigned int vbd_info(struct vbd *vbd)
+{
+ return vbd->type | (vbd->readonly?VDISK_READONLY:0);
+}
+
+unsigned long vbd_secsize(struct vbd *vbd)
+{
+ return bdev_hardsect_size(vbd->bdev);
+}
+
+int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
+ unsigned minor, int readonly, int cdrom)
+{
+ struct vbd *vbd;
+ struct block_device *bdev;
+
+ vbd = &blkif->vbd;
+ vbd->handle = handle;
+ vbd->readonly = readonly;
+ vbd->type = 0;
+
+ vbd->pdevice = MKDEV(major, minor);
+
+ bdev = open_by_devnum(vbd->pdevice,
+ vbd->readonly ? FMODE_READ : FMODE_WRITE);
+
+ if (IS_ERR(bdev)) {
+ DPRINTK("vbd_creat: device %08x could not be opened.\n",
+ vbd->pdevice);
+ return -ENOENT;
+ }
+
+ vbd->bdev = bdev;
+
+ if (vbd->bdev->bd_disk == NULL) {
+ DPRINTK("vbd_creat: device %08x doesn't exist.\n",
+ vbd->pdevice);
+ vbd_free(vbd);
+ return -ENOENT;
+ }
+
+ if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
+ vbd->type |= VDISK_CDROM;
+ if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
+ vbd->type |= VDISK_REMOVABLE;
+
+ DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
+ handle, blkif->domid);
+ return 0;
+}
+
+void vbd_free(struct vbd *vbd)
+{
+ if (vbd->bdev)
+ blkdev_put(vbd->bdev);
+ vbd->bdev = NULL;
+}
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
+{
+ struct vbd *vbd = &blkif->vbd;
+ int rc = -EACCES;
+
+ if ((operation != READ) && vbd->readonly)
+ goto out;
+
+ if (vbd->bdev == NULL)
+ goto out;
+
+ if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
+ goto out;
+
+ req->dev = vbd->pdevice;
+ req->bdev = vbd->bdev;
+ rc = 0;
+
+ out:
+ return rc;
+}
--- /dev/null
+/* Xenbus code for blkif backend
+ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+ Copyright (C) 2005 XenSource Ltd
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include "common.h"
+#include "../core/domctl.h"
+
+#undef DPRINTK
+#define DPRINTK(fmt, args...) \
+ pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \
+ __FUNCTION__, __LINE__, ##args)
+
+static void connect(struct backend_info *);
+static int connect_ring(struct backend_info *);
+static void backend_changed(struct xenbus_watch *, const char **,
+ unsigned int);
+
+static int blkback_name(blkif_t *blkif, char *buf)
+{
+ char *devpath, *devname;
+ struct xenbus_device *dev = blkif->be->dev;
+
+ devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
+ if (IS_ERR(devpath))
+ return PTR_ERR(devpath);
+
+ if ((devname = strstr(devpath, "/dev/")) != NULL)
+ devname += strlen("/dev/");
+ else
+ devname = devpath;
+
+ snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname);
+ kfree(devpath);
+
+ return 0;
+}
+
+static void update_blkif_status(blkif_t *blkif)
+{
+ int err;
+ char name[TASK_COMM_LEN];
+
+ /* Not ready to connect? */
+ if (!blkif->irq || !blkif->vbd.bdev)
+ return;
+
+ /* Already connected? */
+ if (blkif->be->dev->state == XenbusStateConnected)
+ return;
+
+ /* Attempt to connect: exit if we fail to. */
+ connect(blkif->be);
+ if (blkif->be->dev->state != XenbusStateConnected)
+ return;
+
+ err = blkback_name(blkif, name);
+ if (err) {
+ xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
+ return;
+ }
+
+ blkif->xenblkd = kthread_run(blkif_schedule, blkif, name);
+ if (IS_ERR(blkif->xenblkd)) {
+ err = PTR_ERR(blkif->xenblkd);
+ blkif->xenblkd = NULL;
+ xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
+ }
+}
+
+
+/****************************************************************
+ * sysfs interface for VBD I/O requests
+ */
+
+#define VBD_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct device *_dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+ { \
+ struct xenbus_device *dev = to_xenbus_device(_dev); \
+ struct backend_info *be = dev->dev.driver_data; \
+ \
+ return sprintf(buf, format, ##args); \
+ } \
+ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
+VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
+VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
+VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req);
+VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
+VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
+
+static struct attribute *vbdstat_attrs[] = {
+ &dev_attr_oo_req.attr,
+ &dev_attr_rd_req.attr,
+ &dev_attr_wr_req.attr,
+ &dev_attr_br_req.attr,
+ &dev_attr_rd_sect.attr,
+ &dev_attr_wr_sect.attr,
+ NULL
+};
+
+static struct attribute_group vbdstat_group = {
+ .name = "statistics",
+ .attrs = vbdstat_attrs,
+};
+
+VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
+VBD_SHOW(mode, "%s\n", be->mode);
+
+int xenvbd_sysfs_addif(struct xenbus_device *dev)
+{
+ int error;
+
+ error = device_create_file(&dev->dev, &dev_attr_physical_device);
+ if (error)
+ goto fail1;
+
+ error = device_create_file(&dev->dev, &dev_attr_mode);
+ if (error)
+ goto fail2;
+
+ error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
+ if (error)
+ goto fail3;
+
+ return 0;
+
+fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
+fail2: device_remove_file(&dev->dev, &dev_attr_mode);
+fail1: device_remove_file(&dev->dev, &dev_attr_physical_device);
+ return error;
+}
+
+void xenvbd_sysfs_delif(struct xenbus_device *dev)
+{
+ sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
+ device_remove_file(&dev->dev, &dev_attr_mode);
+ device_remove_file(&dev->dev, &dev_attr_physical_device);
+}
+
+static int blkback_remove(struct xenbus_device *dev)
+{
+ struct backend_info *be = dev->dev.driver_data;
+
+ DPRINTK("");
+
+ if (be->major || be->minor)
+ xenvbd_sysfs_delif(dev);
+
+ if (be->backend_watch.node) {
+ unregister_xenbus_watch(&be->backend_watch);
+ kfree(be->backend_watch.node);
+ be->backend_watch.node = NULL;
+ }
+
+ if (be->backend_cdrom_watch.node) {
+ unregister_xenbus_watch(&be->backend_cdrom_watch);
+ kfree(be->backend_cdrom_watch.node);
+ be->backend_cdrom_watch.node = NULL;
+ }
+
+ if (be->blkif) {
+ blkif_disconnect(be->blkif);
+ vbd_free(&be->blkif->vbd);
+ blkif_free(be->blkif);
+ be->blkif = NULL;
+ }
+
+ kfree(be);
+ dev->dev.driver_data = NULL;
+ return 0;
+}
+
+int blkback_barrier(struct xenbus_transaction xbt,
+ struct backend_info *be, int state)
+{
+ struct xenbus_device *dev = be->dev;
+ int err;
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
+ "%d", state);
+ if (err)
+ xenbus_dev_fatal(dev, err, "writing feature-barrier");
+
+ return err;
+}
+
+/**
+ * Entry point to this code when a new device is created. Allocate the basic
+ * structures, and watch the store waiting for the hotplug scripts to tell us
+ * the device's physical major and minor numbers. Switch to InitWait.
+ */
+static int blkback_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err;
+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
+ GFP_KERNEL);
+ if (!be) {
+ xenbus_dev_fatal(dev, -ENOMEM,
+ "allocating backend structure");
+ return -ENOMEM;
+ }
+ be->dev = dev;
+ dev->dev.driver_data = be;
+
+ be->blkif = blkif_alloc(dev->otherend_id);
+ if (IS_ERR(be->blkif)) {
+ err = PTR_ERR(be->blkif);
+ be->blkif = NULL;
+ xenbus_dev_fatal(dev, err, "creating block interface");
+ goto fail;
+ }
+
+ /* setup back pointer */
+ be->blkif->be = be;
+
+ err = xenbus_watch_path2(dev, dev->nodename, "physical-device",
+ &be->backend_watch, backend_changed);
+ if (err)
+ goto fail;
+
+ err = xenbus_switch_state(dev, XenbusStateInitWait);
+ if (err)
+ goto fail;
+
+ return 0;
+
+fail:
+ DPRINTK("failed");
+ blkback_remove(dev);
+ return err;
+}
+
+
+/**
+ * Callback received when the hotplug scripts have placed the physical-device
+ * node. Read it and the mode node, and create a vbd. If the frontend is
+ * ready, connect.
+ */
+static void backend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ int err;
+ unsigned major;
+ unsigned minor;
+ struct backend_info *be
+ = container_of(watch, struct backend_info, backend_watch);
+ struct xenbus_device *dev = be->dev;
+ int cdrom = 0;
+ char *device_type;
+
+ DPRINTK("");
+
+ err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
+ &major, &minor);
+ if (XENBUS_EXIST_ERR(err)) {
+ /* Since this watch will fire once immediately after it is
+ registered, we expect this. Ignore it, and wait for the
+ hotplug scripts. */
+ return;
+ }
+ if (err != 2) {
+ xenbus_dev_fatal(dev, err, "reading physical-device");
+ return;
+ }
+
+ if ((be->major || be->minor) &&
+ ((be->major != major) || (be->minor != minor))) {
+ printk(KERN_WARNING
+ "blkback: changing physical device (from %x:%x to "
+ "%x:%x) not supported.\n", be->major, be->minor,
+ major, minor);
+ return;
+ }
+
+ be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
+ if (IS_ERR(be->mode)) {
+ err = PTR_ERR(be->mode);
+ be->mode = NULL;
+ xenbus_dev_fatal(dev, err, "reading mode");
+ return;
+ }
+
+ device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
+ if (!IS_ERR(device_type)) {
+ cdrom = strcmp(device_type, "cdrom") == 0;
+ kfree(device_type);
+ }
+
+ if (be->major == 0 && be->minor == 0) {
+ /* Front end dir is a number, which is used as the handle. */
+
+ char *p = strrchr(dev->otherend, '/') + 1;
+ long handle = simple_strtoul(p, NULL, 0);
+
+ be->major = major;
+ be->minor = minor;
+
+ err = vbd_create(be->blkif, handle, major, minor,
+ (NULL == strchr(be->mode, 'w')), cdrom);
+ if (err) {
+ be->major = be->minor = 0;
+ xenbus_dev_fatal(dev, err, "creating vbd structure");
+ return;
+ }
+
+ err = xenvbd_sysfs_addif(dev);
+ if (err) {
+ vbd_free(&be->blkif->vbd);
+ be->major = be->minor = 0;
+ xenbus_dev_fatal(dev, err, "creating sysfs entries");
+ return;
+ }
+
+ /* We're potentially connected now */
+ update_blkif_status(be->blkif);
+
+ /* Add watch for cdrom media status if necessay */
+ cdrom_add_media_watch(be);
+ }
+}
+
+
+/**
+ * Callback received when the frontend's state changes.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+ enum xenbus_state frontend_state)
+{
+ struct backend_info *be = dev->dev.driver_data;
+ int err;
+
+ DPRINTK("%s", xenbus_strstate(frontend_state));
+
+ switch (frontend_state) {
+ case XenbusStateInitialising:
+ if (dev->state == XenbusStateClosed) {
+ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
+ __FUNCTION__, dev->nodename);
+ xenbus_switch_state(dev, XenbusStateInitWait);
+ }
+ break;
+
+ case XenbusStateInitialised:
+ case XenbusStateConnected:
+ /* Ensure we connect even when two watches fire in
+ close successsion and we miss the intermediate value
+ of frontend_state. */
+ if (dev->state == XenbusStateConnected)
+ break;
+
+ err = connect_ring(be);
+ if (err)
+ break;
+ update_blkif_status(be->blkif);
+ break;
+
+ case XenbusStateClosing:
+ blkif_disconnect(be->blkif);
+ xenbus_switch_state(dev, XenbusStateClosing);
+ break;
+
+ case XenbusStateClosed:
+ xenbus_switch_state(dev, XenbusStateClosed);
+ if (xenbus_dev_is_online(dev))
+ break;
+ /* fall through if not online */
+ case XenbusStateUnknown:
+ device_unregister(&dev->dev);
+ break;
+
+ default:
+ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+ frontend_state);
+ break;
+ }
+}
+
+
+/* ** Connection ** */
+
+
+/**
+ * Write the physical details regarding the block device to the store, and
+ * switch to Connected state.
+ */
+static void connect(struct backend_info *be)
+{
+ struct xenbus_transaction xbt;
+ int err;
+ struct xenbus_device *dev = be->dev;
+
+ DPRINTK("%s", dev->otherend);
+
+ /* Supply the information about the device the frontend needs */
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "starting transaction");
+ return;
+ }
+
+ err = blkback_barrier(xbt, be, 1);
+ if (err)
+ goto abort;
+
+ err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
+ vbd_size(&be->blkif->vbd));
+ if (err) {
+ xenbus_dev_fatal(dev, err, "writing %s/sectors",
+ dev->nodename);
+ goto abort;
+ }
+
+ /* FIXME: use a typename instead */
+ err = xenbus_printf(xbt, dev->nodename, "info", "%u",
+ vbd_info(&be->blkif->vbd));
+ if (err) {
+ xenbus_dev_fatal(dev, err, "writing %s/info",
+ dev->nodename);
+ goto abort;
+ }
+ err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
+ vbd_secsize(&be->blkif->vbd));
+ if (err) {
+ xenbus_dev_fatal(dev, err, "writing %s/sector-size",
+ dev->nodename);
+ goto abort;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN)
+ goto again;
+ if (err)
+ xenbus_dev_fatal(dev, err, "ending transaction");
+
+ err = xenbus_switch_state(dev, XenbusStateConnected);
+ if (err)
+ xenbus_dev_fatal(dev, err, "switching to Connected state",
+ dev->nodename);
+
+ return;
+ abort:
+ xenbus_transaction_end(xbt, 1);
+}
+
+
+static int connect_ring(struct backend_info *be)
+{
+ struct xenbus_device *dev = be->dev;
+ unsigned long ring_ref;
+ unsigned int evtchn;
+ char protocol[64] = "";
+ int err;
+
+ DPRINTK("%s", dev->otherend);
+
+ err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
+ "event-channel", "%u", &evtchn, NULL);
+ if (err) {
+ xenbus_dev_fatal(dev, err,
+ "reading %s/ring-ref and event-channel",
+ dev->otherend);
+ return err;
+ }
+
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+ err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
+ "%63s", protocol, NULL);
+ if (err) {
+ strcpy(protocol, "unspecified");
+ be->blkif->blk_protocol = xen_guest_blkif_protocol(be->blkif->domid);
+ }
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+#if 1 /* maintain compatibility with early sles10-sp1 and paravirt netware betas */
+ else if (0 == strcmp(protocol, "1"))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+ else if (0 == strcmp(protocol, "2"))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+#endif
+ else {
+ xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
+ return -1;
+ }
+ printk(KERN_INFO
+ "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n",
+ ring_ref, evtchn, be->blkif->blk_protocol, protocol);
+
+ /* Map the shared frame, irq etc. */
+ err = blkif_map(be->blkif, ring_ref, evtchn);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
+ ring_ref, evtchn);
+ return err;
+ }
+
+ return 0;
+}
+
+
+/* ** Driver Registration ** */
+
+
+static const struct xenbus_device_id blkback_ids[] = {
+ { "vbd" },
+ { "" }
+};
+
+
+static struct xenbus_driver blkback = {
+ .name = "vbd",
+ .ids = blkback_ids,
+ .probe = blkback_probe,
+ .remove = blkback_remove,
+ .otherend_changed = frontend_changed
+};
+
+
+void blkif_xenbus_init(void)
+{
+ if (xenbus_register_backend(&blkback))
+ BUG();
+}
--- /dev/null
+
+obj-$(CONFIG_XEN_BLKDEV_FRONTEND) := xenblk.o
+
+xenblk-objs := blkfront.o vbd.o
+
--- /dev/null
+/******************************************************************************
+ * blkfront.c
+ *
+ * XenLinux virtual block-device driver.
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004, Christian Limpach
+ * Copyright (c) 2004, Andrew Warfield
+ * Copyright (c) 2005, Christopher Clark
+ * Copyright (c) 2005, XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/version.h>
+#include "block.h"
+#include <linux/cdrom.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <scsi/scsi.h>
+#include <xen/evtchn.h>
+#include <xen/xenbus.h>
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/protocols.h>
+#include <xen/gnttab.h>
+#include <asm/hypervisor.h>
+#include <asm/maddr.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#define BLKIF_STATE_DISCONNECTED 0
+#define BLKIF_STATE_CONNECTED 1
+#define BLKIF_STATE_SUSPENDED 2
+
+#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
+ (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
+#define GRANT_INVALID_REF 0
+
+static void connect(struct blkfront_info *);
+static void blkfront_closing(struct xenbus_device *);
+static int blkfront_remove(struct xenbus_device *);
+static int talk_to_backend(struct xenbus_device *, struct blkfront_info *);
+static int setup_blkring(struct xenbus_device *, struct blkfront_info *);
+
+static void kick_pending_request_queues(struct blkfront_info *);
+
+static irqreturn_t blkif_int(int irq, void *dev_id);
+static void blkif_restart_queue(struct work_struct *arg);
+static void blkif_recover(struct blkfront_info *);
+static void blkif_completion(struct blk_shadow *);
+static void blkif_free(struct blkfront_info *, int);
+
+
+/**
+ * Entry point to this code when a new device is created. Allocate the basic
+ * structures and the ring buffer for communication with the backend, and
+ * inform the backend of the appropriate details for those. Switch to
+ * Initialised state.
+ */
+static int blkfront_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err, vdevice, i;
+ struct blkfront_info *info;
+
+ /* FIXME: Use dynamic device id if this is not set. */
+ err = xenbus_scanf(XBT_NIL, dev->nodename,
+ "virtual-device", "%i", &vdevice);
+ if (err != 1) {
+ xenbus_dev_fatal(dev, err, "reading virtual-device");
+ return err;
+ }
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
+ return -ENOMEM;
+ }
+
+ info->xbdev = dev;
+ info->vdevice = vdevice;
+ info->connected = BLKIF_STATE_DISCONNECTED;
+ INIT_WORK(&info->work, blkif_restart_queue);
+
+ for (i = 0; i < BLK_RING_SIZE; i++)
+ info->shadow[i].req.id = i+1;
+ info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+
+ /* Front end dir is a number, which is used as the id. */
+ info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0);
+ dev->dev.driver_data = info;
+
+ err = talk_to_backend(dev, info);
+ if (err) {
+ kfree(info);
+ dev->dev.driver_data = NULL;
+ return err;
+ }
+
+ return 0;
+}
+
+
+/**
+ * We are reconnecting to the backend, due to a suspend/resume, or a backend
+ * driver restart. We tear down our blkif structure and recreate it, but
+ * leave the device-layer structures intact so that this is transparent to the
+ * rest of the kernel.
+ */
+static int blkfront_resume(struct xenbus_device *dev)
+{
+ struct blkfront_info *info = dev->dev.driver_data;
+ int err;
+
+ DPRINTK("blkfront_resume: %s\n", dev->nodename);
+
+ blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
+
+ err = talk_to_backend(dev, info);
+ if (info->connected == BLKIF_STATE_SUSPENDED && !err)
+ blkif_recover(info);
+
+ return err;
+}
+
+
+/* Common code used when first setting up, and when resuming. */
+static int talk_to_backend(struct xenbus_device *dev,
+ struct blkfront_info *info)
+{
+ const char *message = NULL;
+ struct xenbus_transaction xbt;
+ int err;
+
+ /* Create shared ring, alloc event channel. */
+ err = setup_blkring(dev, info);
+ if (err)
+ goto out;
+
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "starting transaction");
+ goto destroy_blkring;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename,
+ "ring-ref","%u", info->ring_ref);
+ if (err) {
+ message = "writing ring-ref";
+ goto abort_transaction;
+ }
+ err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
+ irq_to_evtchn_port(info->irq));
+ if (err) {
+ message = "writing event-channel";
+ goto abort_transaction;
+ }
+ err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
+ XEN_IO_PROTO_ABI_NATIVE);
+ if (err) {
+ message = "writing protocol";
+ goto abort_transaction;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err) {
+ if (err == -EAGAIN)
+ goto again;
+ xenbus_dev_fatal(dev, err, "completing transaction");
+ goto destroy_blkring;
+ }
+
+ xenbus_switch_state(dev, XenbusStateInitialised);
+
+ return 0;
+
+ abort_transaction:
+ xenbus_transaction_end(xbt, 1);
+ if (message)
+ xenbus_dev_fatal(dev, err, "%s", message);
+ destroy_blkring:
+ blkif_free(info, 0);
+ out:
+ return err;
+}
+
+
+static int setup_blkring(struct xenbus_device *dev,
+ struct blkfront_info *info)
+{
+ blkif_sring_t *sring;
+ int err;
+
+ info->ring_ref = GRANT_INVALID_REF;
+
+ sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL|__GFP_HIGH);
+ if (!sring) {
+ xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
+ return -ENOMEM;
+ }
+ SHARED_RING_INIT(sring);
+ FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+
+ err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
+ if (err < 0) {
+ free_page((unsigned long)sring);
+ info->ring.sring = NULL;
+ goto fail;
+ }
+ info->ring_ref = err;
+
+ err = bind_listening_port_to_irqhandler(
+ dev->otherend_id, blkif_int, IRQF_SAMPLE_RANDOM, "blkif", info);
+ if (err <= 0) {
+ xenbus_dev_fatal(dev, err,
+ "bind_listening_port_to_irqhandler");
+ goto fail;
+ }
+ info->irq = err;
+
+ return 0;
+fail:
+ blkif_free(info, 0);
+ return err;
+}
+
+
+/**
+ * Callback received when the backend's state changes.
+ */
+static void backend_changed(struct xenbus_device *dev,
+ enum xenbus_state backend_state)
+{
+ struct blkfront_info *info = dev->dev.driver_data;
+ struct block_device *bd;
+
+ DPRINTK("blkfront:backend_changed.\n");
+
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitWait:
+ case XenbusStateInitialised:
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ break;
+
+ case XenbusStateConnected:
+ connect(info);
+ break;
+
+ case XenbusStateClosing:
+ bd = bdget_disk(info->gd, 0);
+ if (bd == NULL)
+ xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+ down(&bd->bd_sem);
+#else
+ mutex_lock(&bd->bd_mutex);
+#endif
+ if (info->users > 0)
+ xenbus_dev_error(dev, -EBUSY,
+ "Device in use; refusing to close");
+ else
+ blkfront_closing(dev);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+ up(&bd->bd_sem);
+#else
+ mutex_unlock(&bd->bd_mutex);
+#endif
+ bdput(bd);
+ break;
+ }
+}
+
+
+/* ** Connection ** */
+
+
+/*
+ * Invoked when the backend is finally 'ready' (and has told produced
+ * the details about the physical device - #sectors, size, etc).
+ */
+static void connect(struct blkfront_info *info)
+{
+ unsigned long long sectors;
+ unsigned long sector_size;
+ unsigned int binfo;
+ int err;
+
+ if ((info->connected == BLKIF_STATE_CONNECTED) ||
+ (info->connected == BLKIF_STATE_SUSPENDED) )
+ return;
+
+ DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);
+
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "sectors", "%Lu", §ors,
+ "info", "%u", &binfo,
+ "sector-size", "%lu", §or_size,
+ NULL);
+ if (err) {
+ xenbus_dev_fatal(info->xbdev, err,
+ "reading backend fields at %s",
+ info->xbdev->otherend);
+ return;
+ }
+
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "feature-barrier", "%lu", &info->feature_barrier,
+ NULL);
+ if (err)
+ info->feature_barrier = 0;
+
+ err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
+ if (err) {
+ xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
+ info->xbdev->otherend);
+ return;
+ }
+
+ (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+ /* Kick pending requests. */
+ spin_lock_irq(&blkif_io_lock);
+ info->connected = BLKIF_STATE_CONNECTED;
+ kick_pending_request_queues(info);
+ spin_unlock_irq(&blkif_io_lock);
+
+ add_disk(info->gd);
+
+ info->is_ready = 1;
+}
+
+/**
+ * Handle the change of state of the backend to Closing. We must delete our
+ * device-layer structures now, to ensure that writes are flushed through to
+ * the backend. Once is this done, we can switch to Closed in
+ * acknowledgement.
+ */
+static void blkfront_closing(struct xenbus_device *dev)
+{
+ struct blkfront_info *info = dev->dev.driver_data;
+ unsigned long flags;
+
+ DPRINTK("blkfront_closing: %s removed\n", dev->nodename);
+
+ if (info->rq == NULL)
+ goto out;
+
+ spin_lock_irqsave(&blkif_io_lock, flags);
+ /* No more blkif_request(). */
+ blk_stop_queue(info->rq);
+ /* No more gnttab callback work. */
+ gnttab_cancel_free_callback(&info->callback);
+ spin_unlock_irqrestore(&blkif_io_lock, flags);
+
+ /* Flush gnttab callback work. Must be done with no locks held. */
+ flush_scheduled_work();
+
+ xlvbd_del(info);
+
+ out:
+ xenbus_frontend_closed(dev);
+}
+
+
+static int blkfront_remove(struct xenbus_device *dev)
+{
+ struct blkfront_info *info = dev->dev.driver_data;
+
+ DPRINTK("blkfront_remove: %s removed\n", dev->nodename);
+
+ blkif_free(info, 0);
+
+ kfree(info);
+
+ return 0;
+}
+
+
+static inline int GET_ID_FROM_FREELIST(
+ struct blkfront_info *info)
+{
+ unsigned long free = info->shadow_free;
+ BUG_ON(free > BLK_RING_SIZE);
+ info->shadow_free = info->shadow[free].req.id;
+ info->shadow[free].req.id = 0x0fffffee; /* debug */
+ return free;
+}
+
+static inline void ADD_ID_TO_FREELIST(
+ struct blkfront_info *info, unsigned long id)
+{
+ info->shadow[id].req.id = info->shadow_free;
+ info->shadow[id].request = 0;
+ info->shadow_free = id;
+}
+
+static inline void flush_requests(struct blkfront_info *info)
+{
+ int notify;
+
+ RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
+
+ if (notify)
+ notify_remote_via_irq(info->irq);
+}
+
+static void kick_pending_request_queues(struct blkfront_info *info)
+{
+ if (!RING_FULL(&info->ring)) {
+ /* Re-enable calldowns. */
+ blk_start_queue(info->rq);
+ /* Kick things off immediately. */
+ do_blkif_request(info->rq);
+ }
+}
+
+static void blkif_restart_queue(struct work_struct *arg)
+{
+ struct blkfront_info *info = container_of(arg, struct blkfront_info, work);
+ spin_lock_irq(&blkif_io_lock);
+ if (info->connected == BLKIF_STATE_CONNECTED)
+ kick_pending_request_queues(info);
+ spin_unlock_irq(&blkif_io_lock);
+}
+
+static void blkif_restart_queue_callback(void *arg)
+{
+ struct blkfront_info *info = (struct blkfront_info *)arg;
+ schedule_work(&info->work);
+}
+
+int blkif_open(struct inode *inode, struct file *filep)
+{
+ struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
+ info->users++;
+ return 0;
+}
+
+
+int blkif_release(struct inode *inode, struct file *filep)
+{
+ struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
+ info->users--;
+ if (info->users == 0) {
+ /* Check whether we have been instructed to close. We will
+ have ignored this request initially, as the device was
+ still mounted. */
+ struct xenbus_device * dev = info->xbdev;
+ enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
+
+ if (state == XenbusStateClosing && info->is_ready)
+ blkfront_closing(dev);
+ }
+ return 0;
+}
+
+
+int blkif_ioctl(struct inode *inode, struct file *filep,
+ unsigned command, unsigned long argument)
+{
+ int i;
+
+ DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
+ command, (long)argument, inode->i_rdev);
+
+ switch (command) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
+ case HDIO_GETGEO: {
+ struct block_device *bd = inode->i_bdev;
+ struct hd_geometry geo;
+ int ret;
+
+ if (!argument)
+ return -EINVAL;
+
+ geo.start = get_start_sect(bd);
+ ret = blkif_getgeo(bd, &geo);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((struct hd_geometry __user *)argument, &geo,
+ sizeof(geo)))
+ return -EFAULT;
+
+ return 0;
+ }
+#endif
+ case CDROMMULTISESSION:
+ DPRINTK("FIXME: support multisession CDs later\n");
+ for (i = 0; i < sizeof(struct cdrom_multisession); i++)
+ if (put_user(0, (char __user *)(argument + i)))
+ return -EFAULT;
+ return 0;
+
+ case CDROM_GET_CAPABILITY: {
+ struct blkfront_info *info =
+ inode->i_bdev->bd_disk->private_data;
+ struct gendisk *gd = info->gd;
+ if (gd->flags & GENHD_FL_CD)
+ return 0;
+ return -EINVAL;
+ }
+ default:
+ /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
+ command);*/
+ return -EINVAL; /* same return as native Linux */
+ }
+
+ return 0;
+}
+
+
+int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
+{
+ /* We don't have real geometry info, but let's at least return
+ values consistent with the size of the device */
+ sector_t nsect = get_capacity(bd->bd_disk);
+ sector_t cylinders = nsect;
+
+ hg->heads = 0xff;
+ hg->sectors = 0x3f;
+ sector_div(cylinders, hg->heads * hg->sectors);
+ hg->cylinders = cylinders;
+ if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
+ hg->cylinders = 0xffff;
+ return 0;
+}
+
+
+/*
+ * blkif_queue_request
+ *
+ * request block io
+ *
+ * id: for guest use only.
+ * operation: BLKIF_OP_{READ,WRITE,PROBE}
+ * buffer: buffer to read/write into. this should be a
+ * virtual address in the guest os.
+ */
+static int blkif_queue_request(struct request *req)
+{
+ struct blkfront_info *info = req->rq_disk->private_data;
+ unsigned long buffer_mfn;
+ blkif_request_t *ring_req;
+ struct bio_vec *bvec;
+ struct req_iterator iter;
+ unsigned long id;
+ unsigned int fsect, lsect;
+ int ref;
+ grant_ref_t gref_head;
+
+ if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
+ return 1;
+
+ if (gnttab_alloc_grant_references(
+ BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
+ gnttab_request_free_callback(
+ &info->callback,
+ blkif_restart_queue_callback,
+ info,
+ BLKIF_MAX_SEGMENTS_PER_REQUEST);
+ return 1;
+ }
+
+ /* Fill out a communications ring structure. */
+ ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
+ id = GET_ID_FROM_FREELIST(info);
+ info->shadow[id].request = (unsigned long)req;
+
+ ring_req->id = id;
+ ring_req->sector_number = (blkif_sector_t)req->sector;
+ ring_req->handle = info->handle;
+
+ ring_req->operation = rq_data_dir(req) ?
+ BLKIF_OP_WRITE : BLKIF_OP_READ;
+ if (blk_barrier_rq(req))
+ ring_req->operation = BLKIF_OP_WRITE_BARRIER;
+
+ ring_req->nr_segments = 0;
+ rq_for_each_segment(bvec, req, iter) {
+ BUG_ON(ring_req->nr_segments
+ == BLKIF_MAX_SEGMENTS_PER_REQUEST);
+ buffer_mfn = page_to_phys(bvec->bv_page) >> PAGE_SHIFT;
+ fsect = bvec->bv_offset >> 9;
+ lsect = fsect + (bvec->bv_len >> 9) - 1;
+ /* install a grant reference. */
+ ref = gnttab_claim_grant_reference(&gref_head);
+ BUG_ON(ref == -ENOSPC);
+
+ gnttab_grant_foreign_access_ref(
+ ref,
+ info->xbdev->otherend_id,
+ buffer_mfn,
+ rq_data_dir(req) ? GTF_readonly : 0 );
+
+ info->shadow[id].frame[ring_req->nr_segments] =
+ mfn_to_pfn(buffer_mfn);
+
+ ring_req->seg[ring_req->nr_segments] =
+ (struct blkif_request_segment) {
+ .gref = ref,
+ .first_sect = fsect,
+ .last_sect = lsect };
+
+ ring_req->nr_segments++;
+ }
+
+ info->ring.req_prod_pvt++;
+
+ /* Keep a private copy so we can reissue requests when recovering. */
+ info->shadow[id].req = *ring_req;
+
+ gnttab_free_grant_references(gref_head);
+
+ return 0;
+}
+
+/*
+ * do_blkif_request
+ * read a block; request is in a request queue
+ */
+void do_blkif_request(struct request_queue *rq)
+{
+ struct blkfront_info *info = NULL;
+ struct request *req;
+ int queued;
+
+ DPRINTK("Entered do_blkif_request\n");
+
+ queued = 0;
+
+ while ((req = elv_next_request(rq)) != NULL) {
+ info = req->rq_disk->private_data;
+ if (!blk_fs_request(req)) {
+ end_request(req, 0);
+ continue;
+ }
+
+ if (RING_FULL(&info->ring))
+ goto wait;
+
+ DPRINTK("do_blk_req %p: cmd %p, sec %llx, "
+ "(%u/%li) buffer:%p [%s]\n",
+ req, req->cmd, (long long)req->sector,
+ req->current_nr_sectors,
+ req->nr_sectors, req->buffer,
+ rq_data_dir(req) ? "write" : "read");
+
+
+ blkdev_dequeue_request(req);
+ if (blkif_queue_request(req)) {
+ blk_requeue_request(rq, req);
+ wait:
+ /* Avoid pointless unplugs. */
+ blk_stop_queue(rq);
+ break;
+ }
+
+ queued++;
+ }
+
+ if (queued != 0)
+ flush_requests(info);
+}
+
+
+static irqreturn_t blkif_int(int irq, void *dev_id)
+{
+ struct request *req;
+ blkif_response_t *bret;
+ RING_IDX i, rp;
+ unsigned long flags;
+ struct blkfront_info *info = (struct blkfront_info *)dev_id;
+
+ spin_lock_irqsave(&blkif_io_lock, flags);
+
+ if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
+ spin_unlock_irqrestore(&blkif_io_lock, flags);
+ return IRQ_HANDLED;
+ }
+
+ again:
+ rp = info->ring.sring->rsp_prod;
+ rmb(); /* Ensure we see queued responses up to 'rp'. */
+
+ for (i = info->ring.rsp_cons; i != rp; i++) {
+ unsigned long id;
+ int ret;
+
+ bret = RING_GET_RESPONSE(&info->ring, i);
+ id = bret->id;
+ req = (struct request *)info->shadow[id].request;
+
+ blkif_completion(&info->shadow[id]);
+
+ ADD_ID_TO_FREELIST(info, id);
+
+ ret = bret->status == BLKIF_RSP_OKAY ? 0 : -EIO;
+ switch (bret->operation) {
+ case BLKIF_OP_WRITE_BARRIER:
+ if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
+ printk("blkfront: %s: write barrier op failed\n",
+ info->gd->disk_name);
+ ret = -EOPNOTSUPP;
+ info->feature_barrier = 0;
+ xlvbd_barrier(info);
+ }
+ /* fall through */
+ case BLKIF_OP_READ:
+ case BLKIF_OP_WRITE:
+ if (unlikely(bret->status != BLKIF_RSP_OKAY))
+ DPRINTK("Bad return from blkdev data "
+ "request: %x\n", bret->status);
+
+ ret = __blk_end_request(req, ret, blk_rq_bytes(req));
+ BUG_ON(ret);
+ break;
+ default:
+ BUG();
+ }
+ }
+
+ info->ring.rsp_cons = i;
+
+ if (i != info->ring.req_prod_pvt) {
+ int more_to_do;
+ RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
+ if (more_to_do)
+ goto again;
+ } else
+ info->ring.sring->rsp_event = i + 1;
+
+ kick_pending_request_queues(info);
+
+ spin_unlock_irqrestore(&blkif_io_lock, flags);
+
+ return IRQ_HANDLED;
+}
+
+static void blkif_free(struct blkfront_info *info, int suspend)
+{
+ /* Prevent new requests being issued until we fix things up. */
+ spin_lock_irq(&blkif_io_lock);
+ info->connected = suspend ?
+ BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
+ /* No more blkif_request(). */
+ if (info->rq)
+ blk_stop_queue(info->rq);
+ /* No more gnttab callback work. */
+ gnttab_cancel_free_callback(&info->callback);
+ spin_unlock_irq(&blkif_io_lock);
+
+ /* Flush gnttab callback work. Must be done with no locks held. */
+ flush_scheduled_work();
+
+ /* Free resources associated with old device channel. */
+ if (info->ring_ref != GRANT_INVALID_REF) {
+ gnttab_end_foreign_access(info->ring_ref,
+ (unsigned long)info->ring.sring);
+ info->ring_ref = GRANT_INVALID_REF;
+ info->ring.sring = NULL;
+ }
+ if (info->irq)
+ unbind_from_irqhandler(info->irq, info);
+ info->irq = 0;
+}
+
+static void blkif_completion(struct blk_shadow *s)
+{
+ int i;
+ for (i = 0; i < s->req.nr_segments; i++)
+ gnttab_end_foreign_access(s->req.seg[i].gref, 0UL);
+}
+
+static void blkif_recover(struct blkfront_info *info)
+{
+ int i;
+ blkif_request_t *req;
+ struct blk_shadow *copy;
+ int j;
+
+ /* Stage 1: Make a safe copy of the shadow state. */
+ copy = kmalloc(sizeof(info->shadow), GFP_KERNEL | __GFP_NOFAIL | __GFP_HIGH);
+ memcpy(copy, info->shadow, sizeof(info->shadow));
+
+ /* Stage 2: Set up free list. */
+ memset(&info->shadow, 0, sizeof(info->shadow));
+ for (i = 0; i < BLK_RING_SIZE; i++)
+ info->shadow[i].req.id = i+1;
+ info->shadow_free = info->ring.req_prod_pvt;
+ info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+
+ /* Stage 3: Find pending requests and requeue them. */
+ for (i = 0; i < BLK_RING_SIZE; i++) {
+ /* Not in use? */
+ if (copy[i].request == 0)
+ continue;
+
+ /* Grab a request slot and copy shadow state into it. */
+ req = RING_GET_REQUEST(
+ &info->ring, info->ring.req_prod_pvt);
+ *req = copy[i].req;
+
+ /* We get a new request id, and must reset the shadow state. */
+ req->id = GET_ID_FROM_FREELIST(info);
+ memcpy(&info->shadow[req->id], ©[i], sizeof(copy[i]));
+
+ /* Rewrite any grant references invalidated by susp/resume. */
+ for (j = 0; j < req->nr_segments; j++)
+ gnttab_grant_foreign_access_ref(
+ req->seg[j].gref,
+ info->xbdev->otherend_id,
+ pfn_to_mfn(info->shadow[req->id].frame[j]),
+ rq_data_dir((struct request *)
+ info->shadow[req->id].request) ?
+ GTF_readonly : 0);
+ info->shadow[req->id].req = *req;
+
+ info->ring.req_prod_pvt++;
+ }
+
+ kfree(copy);
+
+ (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+ spin_lock_irq(&blkif_io_lock);
+
+ /* Now safe for us to use the shared ring */
+ info->connected = BLKIF_STATE_CONNECTED;
+
+ /* Send off requeued requests */
+ flush_requests(info);
+
+ /* Kick any other new requests queued since we resumed */
+ kick_pending_request_queues(info);
+
+ spin_unlock_irq(&blkif_io_lock);
+}
+
+int blkfront_is_ready(struct xenbus_device *dev)
+{
+ struct blkfront_info *info = dev->dev.driver_data;
+
+ return info->is_ready;
+}
+
+
+/* ** Driver Registration ** */
+
+
+static const struct xenbus_device_id blkfront_ids[] = {
+ { "vbd" },
+ { "" }
+};
+MODULE_ALIAS("xen:vbd");
+
+static struct xenbus_driver blkfront = {
+ .name = "vbd",
+ .ids = blkfront_ids,
+ .probe = blkfront_probe,
+ .remove = blkfront_remove,
+ .resume = blkfront_resume,
+ .otherend_changed = backend_changed,
+ .is_ready = blkfront_is_ready,
+};
+
+
+static int __init xlblk_init(void)
+{
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ return xenbus_register_frontend(&blkfront);
+}
+module_init(xlblk_init);
+
+
+static void __exit xlblk_exit(void)
+{
+ return xenbus_unregister_driver(&blkfront);
+}
+module_exit(xlblk_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null
+/******************************************************************************
+ * block.h
+ *
+ * Shared definitions between all levels of XenLinux Virtual block devices.
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004-2005, Christian Limpach
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_DRIVERS_BLOCK_H__
+#define __XEN_DRIVERS_BLOCK_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/hdreg.h>
+#include <linux/blkdev.h>
+#include <linux/major.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/gnttab.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/ring.h>
+#include <asm/io.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+
+#define DPRINTK(_f, _a...) pr_debug(_f, ## _a)
+
+#if 0
+#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
+#else
+#define DPRINTK_IOCTL(_f, _a...) ((void)0)
+#endif
+
+struct xlbd_type_info
+{
+ int partn_shift;
+ int disks_per_major;
+ char *devname;
+ char *diskname;
+};
+
+struct xlbd_major_info
+{
+ int major;
+ int index;
+ int usage;
+ struct xlbd_type_info *type;
+};
+
+struct blk_shadow {
+ blkif_request_t req;
+ unsigned long request;
+ unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+
+#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
+
+/*
+ * We have one of these per vbd, whether ide, scsi or 'other'. They
+ * hang in private_data off the gendisk structure. We may end up
+ * putting all kinds of interesting stuff here :-)
+ */
+struct blkfront_info
+{
+ struct xenbus_device *xbdev;
+ struct gendisk *gd;
+ int vdevice;
+ blkif_vdev_t handle;
+ int connected;
+ int ring_ref;
+ blkif_front_ring_t ring;
+ unsigned int irq;
+ struct xlbd_major_info *mi;
+ struct request_queue *rq;
+ struct work_struct work;
+ struct gnttab_free_callback callback;
+ struct blk_shadow shadow[BLK_RING_SIZE];
+ unsigned long shadow_free;
+ int feature_barrier;
+ int is_ready;
+
+ /**
+ * The number of people holding this device open. We won't allow a
+ * hot-unplug unless this is 0.
+ */
+ int users;
+};
+
+extern spinlock_t blkif_io_lock;
+
+extern int blkif_open(struct inode *inode, struct file *filep);
+extern int blkif_release(struct inode *inode, struct file *filep);
+extern int blkif_ioctl(struct inode *inode, struct file *filep,
+ unsigned command, unsigned long argument);
+extern int blkif_getgeo(struct block_device *, struct hd_geometry *);
+extern int blkif_check(dev_t dev);
+extern int blkif_revalidate(dev_t dev);
+extern void do_blkif_request (struct request_queue *rq);
+
+/* Virtual block-device subsystem. */
+/* Note that xlvbd_add doesn't call add_disk for you: you're expected
+ to call add_disk on info->gd once the disk is properly connected
+ up. */
+int xlvbd_add(blkif_sector_t capacity, int device,
+ u16 vdisk_info, u16 sector_size, struct blkfront_info *info);
+void xlvbd_del(struct blkfront_info *info);
+int xlvbd_barrier(struct blkfront_info *info);
+
+#endif /* __XEN_DRIVERS_BLOCK_H__ */
--- /dev/null
+/******************************************************************************
+ * vbd.c
+ *
+ * XenLinux virtual block-device driver (xvd).
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004-2005, Christian Limpach
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "block.h"
+#include <linux/blkdev.h>
+#include <linux/list.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#define BLKIF_MAJOR(dev) ((dev)>>8)
+#define BLKIF_MINOR(dev) ((dev) & 0xff)
+
+/*
+ * For convenience we distinguish between ide, scsi and 'other' (i.e.,
+ * potentially combinations of the two) in the naming scheme and in a few other
+ * places.
+ */
+
+#define NUM_IDE_MAJORS 10
+#define NUM_SCSI_MAJORS 17
+#define NUM_VBD_MAJORS 1
+
+static struct xlbd_type_info xlbd_ide_type = {
+ .partn_shift = 6,
+ .disks_per_major = 2,
+ .devname = "ide",
+ .diskname = "hd",
+};
+
+static struct xlbd_type_info xlbd_scsi_type = {
+ .partn_shift = 4,
+ .disks_per_major = 16,
+ .devname = "sd",
+ .diskname = "sd",
+};
+
+static struct xlbd_type_info xlbd_vbd_type = {
+ .partn_shift = 4,
+ .disks_per_major = 16,
+ .devname = "xvd",
+ .diskname = "xvd",
+};
+
+static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS +
+ NUM_VBD_MAJORS];
+
+#define XLBD_MAJOR_IDE_START 0
+#define XLBD_MAJOR_SCSI_START (NUM_IDE_MAJORS)
+#define XLBD_MAJOR_VBD_START (NUM_IDE_MAJORS + NUM_SCSI_MAJORS)
+
+#define XLBD_MAJOR_IDE_RANGE XLBD_MAJOR_IDE_START ... XLBD_MAJOR_SCSI_START - 1
+#define XLBD_MAJOR_SCSI_RANGE XLBD_MAJOR_SCSI_START ... XLBD_MAJOR_VBD_START - 1
+#define XLBD_MAJOR_VBD_RANGE XLBD_MAJOR_VBD_START ... XLBD_MAJOR_VBD_START + NUM_VBD_MAJORS - 1
+
+/* Information about our VBDs. */
+#define MAX_VBDS 64
+static LIST_HEAD(vbds_list);
+
+static struct block_device_operations xlvbd_block_fops =
+{
+ .owner = THIS_MODULE,
+ .open = blkif_open,
+ .release = blkif_release,
+ .ioctl = blkif_ioctl,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+ .getgeo = blkif_getgeo
+#endif
+};
+
+DEFINE_SPINLOCK(blkif_io_lock);
+
+static struct xlbd_major_info *
+xlbd_alloc_major_info(int major, int minor, int index)
+{
+ struct xlbd_major_info *ptr;
+
+ ptr = kzalloc(sizeof(struct xlbd_major_info), GFP_KERNEL);
+ if (ptr == NULL)
+ return NULL;
+
+ ptr->major = major;
+
+ switch (index) {
+ case XLBD_MAJOR_IDE_RANGE:
+ ptr->type = &xlbd_ide_type;
+ ptr->index = index - XLBD_MAJOR_IDE_START;
+ break;
+ case XLBD_MAJOR_SCSI_RANGE:
+ ptr->type = &xlbd_scsi_type;
+ ptr->index = index - XLBD_MAJOR_SCSI_START;
+ break;
+ case XLBD_MAJOR_VBD_RANGE:
+ ptr->type = &xlbd_vbd_type;
+ ptr->index = index - XLBD_MAJOR_VBD_START;
+ break;
+ }
+
+ if (register_blkdev(ptr->major, ptr->type->devname)) {
+ kfree(ptr);
+ return NULL;
+ }
+
+ printk("xen-vbd: registered block device major %i\n", ptr->major);
+ major_info[index] = ptr;
+ return ptr;
+}
+
+static struct xlbd_major_info *
+xlbd_get_major_info(int vdevice)
+{
+ struct xlbd_major_info *mi;
+ int major, minor, index;
+
+ major = BLKIF_MAJOR(vdevice);
+ minor = BLKIF_MINOR(vdevice);
+
+ switch (major) {
+ case IDE0_MAJOR: index = 0; break;
+ case IDE1_MAJOR: index = 1; break;
+ case IDE2_MAJOR: index = 2; break;
+ case IDE3_MAJOR: index = 3; break;
+ case IDE4_MAJOR: index = 4; break;
+ case IDE5_MAJOR: index = 5; break;
+ case IDE6_MAJOR: index = 6; break;
+ case IDE7_MAJOR: index = 7; break;
+ case IDE8_MAJOR: index = 8; break;
+ case IDE9_MAJOR: index = 9; break;
+ case SCSI_DISK0_MAJOR: index = 10; break;
+ case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
+ index = 11 + major - SCSI_DISK1_MAJOR;
+ break;
+ case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR:
+ index = 18 + major - SCSI_DISK8_MAJOR;
+ break;
+ case SCSI_CDROM_MAJOR: index = 26; break;
+ default: index = 27; break;
+ }
+
+ mi = ((major_info[index] != NULL) ? major_info[index] :
+ xlbd_alloc_major_info(major, minor, index));
+ if (mi)
+ mi->usage++;
+ return mi;
+}
+
+static void
+xlbd_put_major_info(struct xlbd_major_info *mi)
+{
+ mi->usage--;
+ /* XXX: release major if 0 */
+}
+
+static int
+xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
+{
+ struct request_queue *rq;
+
+ rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
+ if (rq == NULL)
+ return -1;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
+ elevator_init(rq, "noop");
+#else
+ elevator_init(rq, &elevator_noop);
+#endif
+
+ /* Hard sector size and max sectors impersonate the equiv. hardware. */
+ blk_queue_hardsect_size(rq, sector_size);
+ blk_queue_max_sectors(rq, 512);
+
+ /* Each segment in a request is up to an aligned page in size. */
+ blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
+ blk_queue_max_segment_size(rq, PAGE_SIZE);
+
+ /* Ensure a merged request will fit in a single I/O ring slot. */
+ blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+ blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+ /* Make sure buffer addresses are sector-aligned. */
+ blk_queue_dma_alignment(rq, 511);
+
+ /* Make sure we don't use bounce buffers. */
+ blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY);
+
+ gd->queue = rq;
+
+ return 0;
+}
+
+int
+xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
+ u16 sector_size, struct blkfront_info *info)
+{
+ int minor = BLKIF_MINOR(vdevice);
+ struct gendisk *gd;
+ struct xlbd_major_info *mi;
+ int nr_minors = 1;
+ int err = -ENODEV;
+ unsigned int offset;
+
+ BUG_ON(info->gd != NULL);
+ BUG_ON(info->mi != NULL);
+ BUG_ON(info->rq != NULL);
+
+ mi = xlbd_get_major_info(vdevice);
+ if (mi == NULL)
+ goto out;
+ info->mi = mi;
+
+ if ((minor & ((1 << mi->type->partn_shift) - 1)) == 0)
+ nr_minors = 1 << mi->type->partn_shift;
+
+ gd = alloc_disk(nr_minors);
+ if (gd == NULL)
+ goto out;
+
+ offset = mi->index * mi->type->disks_per_major +
+ (minor >> mi->type->partn_shift);
+ if (nr_minors > 1) {
+ if (offset < 26) {
+ sprintf(gd->disk_name, "%s%c",
+ mi->type->diskname, 'a' + offset );
+ }
+ else {
+ sprintf(gd->disk_name, "%s%c%c",
+ mi->type->diskname,
+ 'a' + ((offset/26)-1), 'a' + (offset%26) );
+ }
+ }
+ else {
+ if (offset < 26) {
+ sprintf(gd->disk_name, "%s%c%d",
+ mi->type->diskname,
+ 'a' + offset,
+ minor & ((1 << mi->type->partn_shift) - 1));
+ }
+ else {
+ sprintf(gd->disk_name, "%s%c%c%d",
+ mi->type->diskname,
+ 'a' + ((offset/26)-1), 'a' + (offset%26),
+ minor & ((1 << mi->type->partn_shift) - 1));
+ }
+ }
+
+ gd->major = mi->major;
+ gd->first_minor = minor;
+ gd->fops = &xlvbd_block_fops;
+ gd->private_data = info;
+ gd->driverfs_dev = &(info->xbdev->dev);
+ set_capacity(gd, capacity);
+
+ if (xlvbd_init_blk_queue(gd, sector_size)) {
+ del_gendisk(gd);
+ goto out;
+ }
+
+ info->rq = gd->queue;
+ info->gd = gd;
+
+ if (info->feature_barrier)
+ xlvbd_barrier(info);
+
+ if (vdisk_info & VDISK_READONLY)
+ set_disk_ro(gd, 1);
+
+ if (vdisk_info & VDISK_REMOVABLE)
+ gd->flags |= GENHD_FL_REMOVABLE;
+
+ if (vdisk_info & VDISK_CDROM)
+ gd->flags |= GENHD_FL_CD;
+
+ return 0;
+
+ out:
+ if (mi)
+ xlbd_put_major_info(mi);
+ info->mi = NULL;
+ return err;
+}
+
+void
+xlvbd_del(struct blkfront_info *info)
+{
+ if (info->mi == NULL)
+ return;
+
+ BUG_ON(info->gd == NULL);
+ del_gendisk(info->gd);
+ put_disk(info->gd);
+ info->gd = NULL;
+
+ xlbd_put_major_info(info->mi);
+ info->mi = NULL;
+
+ BUG_ON(info->rq == NULL);
+ blk_cleanup_queue(info->rq);
+ info->rq = NULL;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+int
+xlvbd_barrier(struct blkfront_info *info)
+{
+ int err;
+
+ err = blk_queue_ordered(info->rq,
+ info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE, NULL);
+ if (err)
+ return err;
+ printk(KERN_INFO "blkfront: %s: barriers %s\n",
+ info->gd->disk_name, info->feature_barrier ? "enabled" : "disabled");
+ return 0;
+}
+#else
+int
+xlvbd_barrier(struct blkfront_info *info)
+{
+ printk(KERN_INFO "blkfront: %s: barriers disabled\n", info->gd->disk_name);
+ return -ENOSYS;
+}
+#endif
--- /dev/null
+LINUXINCLUDE += -I../xen/include/public/io
+
+obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o
+
+blktap-y := xenbus.o interface.o blocktap.o
--- /dev/null
+/******************************************************************************
+ * drivers/xen/blktap/blktap.c
+ *
+ * Back-end driver for user level virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. Requests
+ * are remapped to a user-space memory region.
+ *
+ * Based on the blkback driver code.
+ *
+ * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
+ *
+ * Clean ups and fix ups:
+ * Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/list.h>
+#include <asm/hypervisor.h>
+#include "common.h"
+#include <xen/balloon.h>
+#include <xen/driver_util.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/gfp.h>
+#include <linux/poll.h>
+#include <linux/delay.h>
+#include <asm/tlbflush.h>
+
+#define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
+#define MAX_DEV_NAME 100 /*the max tapdisk ring device name e.g. blktap0 */
+
+/*
+ * The maximum number of requests that can be outstanding at any time
+ * is determined by
+ *
+ * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
+ *
+ * where mmap_alloc < MAX_DYNAMIC_MEM.
+ *
+ * TODO:
+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
+ * sysfs.
+ */
+#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
+#define MAX_DYNAMIC_MEM BLK_RING_SIZE
+#define MAX_PENDING_REQS BLK_RING_SIZE
+#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_start, _req,_seg) \
+ (_start + \
+ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
+ ((_seg) * PAGE_SIZE))
+static int blkif_reqs = MAX_PENDING_REQS;
+static int mmap_pages = MMAP_PAGES;
+
+#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
+ * have a bunch of pages reserved for shared
+ * memory rings.
+ */
+
+/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
+typedef struct domid_translate {
+ unsigned short domid;
+ unsigned short busid;
+} domid_translate_t ;
+
+/*Data struct associated with each of the tapdisk devices*/
+typedef struct tap_blkif {
+ struct vm_area_struct *vma; /*Shared memory area */
+ unsigned long rings_vstart; /*Kernel memory mapping */
+ unsigned long user_vstart; /*User memory mapping */
+ unsigned long dev_inuse; /*One process opens device at a time. */
+ unsigned long dev_pending; /*In process of being opened */
+ unsigned long ring_ok; /*make this ring->state */
+ blkif_front_ring_t ufe_ring; /*Rings up to user space. */
+ wait_queue_head_t wait; /*for poll */
+ unsigned long mode; /*current switching mode */
+ int minor; /*Minor number for tapdisk device */
+ pid_t pid; /*tapdisk process id */
+ enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
+ shutdown */
+ unsigned long *idx_map; /*Record the user ring id to kern
+ [req id, idx] tuple */
+ blkif_t *blkif; /*Associate blkif with tapdev */
+ struct domid_translate trans; /*Translation from domid to bus. */
+} tap_blkif_t;
+
+static struct tap_blkif *tapfds[MAX_TAP_DEV];
+static int blktap_next_minor;
+
+module_param(blkif_reqs, int, 0);
+/* Run-time switchable: /sys/module/blktap/parameters/ */
+static unsigned int log_stats = 0;
+static unsigned int debug_lvl = 0;
+module_param(log_stats, int, 0644);
+module_param(debug_lvl, int, 0644);
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+ blkif_t *blkif;
+ u64 id;
+ unsigned short mem_idx;
+ int nr_pages;
+ atomic_t pendcnt;
+ unsigned short operation;
+ int status;
+ struct list_head free_list;
+ int inuse;
+} pending_req_t;
+
+static pending_req_t *pending_reqs[MAX_PENDING_REQS];
+static struct list_head pending_free;
+static DEFINE_SPINLOCK(pending_free_lock);
+static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
+static int alloc_pending_reqs;
+
+typedef unsigned int PEND_RING_IDX;
+
+static inline int MASK_PEND_IDX(int i) {
+ return (i & (MAX_PENDING_REQS-1));
+}
+
+static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
+ return (req - pending_reqs[idx]);
+}
+
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+static struct page **foreign_pages[MAX_DYNAMIC_MEM];
+static inline unsigned long idx_to_kaddr(
+ unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
+{
+ unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
+ unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
+ return (unsigned long)pfn_to_kaddr(pfn);
+}
+
+static unsigned short mmap_alloc = 0;
+static unsigned short mmap_lock = 0;
+static unsigned short mmap_inuse = 0;
+
+/******************************************************************
+ * GRANT HANDLES
+ */
+
+/* When using grant tables to map a frame for device access then the
+ * handle returned must be used to unmap the frame. This is needed to
+ * drop the ref count on the frame.
+ */
+struct grant_handle_pair
+{
+ grant_handle_t kernel;
+ grant_handle_t user;
+};
+#define INVALID_GRANT_HANDLE 0xFFFF
+
+static struct grant_handle_pair
+ pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
+#define pending_handle(_id, _idx, _i) \
+ (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
+ + (_i)])
+
+
+static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/
+
+#define BLKTAP_MINOR 0 /*/dev/xen/blktap has a dynamic major */
+#define BLKTAP_DEV_DIR "/dev/xen"
+
+static int blktap_major;
+
+/* blktap IOCTLs: */
+#define BLKTAP_IOCTL_KICK_FE 1
+#define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */
+#define BLKTAP_IOCTL_SETMODE 3
+#define BLKTAP_IOCTL_SENDPID 4
+#define BLKTAP_IOCTL_NEWINTF 5
+#define BLKTAP_IOCTL_MINOR 6
+#define BLKTAP_IOCTL_MAJOR 7
+#define BLKTAP_QUERY_ALLOC_REQS 8
+#define BLKTAP_IOCTL_FREEINTF 9
+#define BLKTAP_IOCTL_PRINT_IDXS 100
+
+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
+#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
+#define BLKTAP_MODE_INTERCEPT_FE 0x00000001
+#define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. */
+
+#define BLKTAP_MODE_INTERPOSE \
+ (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
+
+
+static inline int BLKTAP_MODE_VALID(unsigned long arg)
+{
+ return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
+ (arg == BLKTAP_MODE_INTERCEPT_FE) ||
+ (arg == BLKTAP_MODE_INTERPOSE ));
+}
+
+/* Requests passing through the tap to userspace are re-assigned an ID.
+ * We must record a mapping between the BE [IDX,ID] tuple and the userspace
+ * ring ID.
+ */
+
+static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
+{
+ return ((fe_dom << 16) | MASK_PEND_IDX(idx));
+}
+
+extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
+{
+ return (PEND_RING_IDX)(id & 0x0000ffff);
+}
+
+extern inline int ID_TO_MIDX(unsigned long id)
+{
+ return (int)(id >> 16);
+}
+
+#define INVALID_REQ 0xdead0000
+
+/*TODO: Convert to a free list*/
+static inline int GET_NEXT_REQ(unsigned long *idx_map)
+{
+ int i;
+ for (i = 0; i < MAX_PENDING_REQS; i++)
+ if (idx_map[i] == INVALID_REQ)
+ return i;
+
+ return INVALID_REQ;
+}
+
+static inline int OFFSET_TO_USR_IDX(int offset)
+{
+ return offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
+}
+
+static inline int OFFSET_TO_SEG(int offset)
+{
+ return offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
+}
+
+
+#define BLKTAP_INVALID_HANDLE(_g) \
+ (((_g->kernel) == INVALID_GRANT_HANDLE) && \
+ ((_g->user) == INVALID_GRANT_HANDLE))
+
+#define BLKTAP_INVALIDATE_HANDLE(_g) do { \
+ (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \
+ } while(0)
+
+
+/******************************************************************
+ * BLKTAP VM OPS
+ */
+
+static int blktap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ /*
+ * if the page has not been mapped in by the driver then return
+ * VM_FAULT_SIGBUS to the domain.
+ */
+
+ return VM_FAULT_SIGBUS;
+}
+
+static pte_t blktap_clear_pte(struct vm_area_struct *vma,
+ unsigned long uvaddr,
+ pte_t *ptep, int is_fullmm)
+{
+ pte_t copy;
+ tap_blkif_t *info;
+ int offset, seg, usr_idx, pending_idx, mmap_idx;
+ unsigned long uvstart = vma->vm_start + (RING_PAGES << PAGE_SHIFT);
+ unsigned long kvaddr;
+ struct page **map;
+ struct page *pg;
+ struct grant_handle_pair *khandle;
+ struct gnttab_unmap_grant_ref unmap[2];
+ int count = 0;
+
+ /*
+ * If the address is before the start of the grant mapped region or
+ * if vm_file is NULL (meaning mmap failed and we have nothing to do)
+ */
+ if (uvaddr < uvstart || vma->vm_file == NULL)
+ return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
+ is_fullmm);
+
+ info = vma->vm_file->private_data;
+ map = vma->vm_private_data;
+
+ /* TODO Should these be changed to if statements? */
+ BUG_ON(!info);
+ BUG_ON(!info->idx_map);
+ BUG_ON(!map);
+
+ offset = (int) ((uvaddr - uvstart) >> PAGE_SHIFT);
+ usr_idx = OFFSET_TO_USR_IDX(offset);
+ seg = OFFSET_TO_SEG(offset);
+
+ pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
+ mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
+
+ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, seg);
+ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+ ClearPageReserved(pg);
+ map[offset + RING_PAGES] = NULL;
+
+ khandle = &pending_handle(mmap_idx, pending_idx, seg);
+
+ if (khandle->kernel != INVALID_GRANT_HANDLE) {
+ gnttab_set_unmap_op(&unmap[count], kvaddr,
+ GNTMAP_host_map, khandle->kernel);
+ count++;
+
+ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
+ INVALID_P2M_ENTRY);
+ }
+
+ if (khandle->user != INVALID_GRANT_HANDLE) {
+ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
+
+ copy = *ptep;
+ gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep),
+ GNTMAP_host_map
+ | GNTMAP_application_map
+ | GNTMAP_contains_pte,
+ khandle->user);
+ count++;
+ } else {
+ BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
+
+ /* USING SHADOW PAGE TABLES. */
+ copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
+ is_fullmm);
+ }
+
+ if (count) {
+ BLKTAP_INVALIDATE_HANDLE(khandle);
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ unmap, count))
+ BUG();
+ }
+
+ return copy;
+}
+
+struct vm_operations_struct blktap_vm_ops = {
+ fault: blktap_fault,
+ zap_pte: blktap_clear_pte,
+};
+
+/******************************************************************
+ * BLKTAP FILE OPS
+ */
+
+/*Function Declarations*/
+static tap_blkif_t *get_next_free_dev(void);
+static int blktap_open(struct inode *inode, struct file *filp);
+static int blktap_release(struct inode *inode, struct file *filp);
+static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
+static int blktap_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg);
+static unsigned int blktap_poll(struct file *file, poll_table *wait);
+
+static const struct file_operations blktap_fops = {
+ .owner = THIS_MODULE,
+ .poll = blktap_poll,
+ .ioctl = blktap_ioctl,
+ .open = blktap_open,
+ .release = blktap_release,
+ .mmap = blktap_mmap,
+};
+
+
+static tap_blkif_t *get_next_free_dev(void)
+{
+ struct class *class;
+ tap_blkif_t *info;
+ int minor;
+
+ /*
+ * This is called only from the ioctl, which
+ * means we should always have interrupts enabled.
+ */
+ BUG_ON(irqs_disabled());
+
+ spin_lock_irq(&pending_free_lock);
+
+ /* tapfds[0] is always NULL */
+
+ for (minor = 1; minor < blktap_next_minor; minor++) {
+ info = tapfds[minor];
+ /* we could have failed a previous attempt. */
+ if (!info ||
+ ((info->dev_inuse == 0) &&
+ (info->dev_pending == 0)) ) {
+ info->dev_pending = 1;
+ goto found;
+ }
+ }
+ info = NULL;
+ minor = -1;
+
+ /*
+ * We didn't find free device. If we can still allocate
+ * more, then we grab the next device minor that is
+ * available. This is done while we are still under
+ * the protection of the pending_free_lock.
+ */
+ if (blktap_next_minor < MAX_TAP_DEV)
+ minor = blktap_next_minor++;
+found:
+ spin_unlock_irq(&pending_free_lock);
+
+ if (!info && minor > 0) {
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (unlikely(!info)) {
+ /*
+ * If we failed here, try to put back
+ * the next minor number. But if one
+ * was just taken, then we just lose this
+ * minor. We can try to allocate this
+ * minor again later.
+ */
+ spin_lock_irq(&pending_free_lock);
+ if (blktap_next_minor == minor+1)
+ blktap_next_minor--;
+ spin_unlock_irq(&pending_free_lock);
+ goto out;
+ }
+
+ info->minor = minor;
+ /*
+ * Make sure that we have a minor before others can
+ * see us.
+ */
+ wmb();
+ tapfds[minor] = info;
+
+ if ((class = get_xen_class()) != NULL)
+ device_create(class, NULL, MKDEV(blktap_major, minor),
+ "blktap%d", minor);
+ }
+
+out:
+ return info;
+}
+
+int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif)
+{
+ tap_blkif_t *info;
+ int i;
+
+ for (i = 1; i < blktap_next_minor; i++) {
+ info = tapfds[i];
+ if ( info &&
+ (info->trans.domid == domid) &&
+ (info->trans.busid == xenbus_id) ) {
+ info->blkif = blkif;
+ info->status = RUNNING;
+ return i;
+ }
+ }
+ return -1;
+}
+
+void signal_tapdisk(int idx)
+{
+ tap_blkif_t *info;
+ struct task_struct *ptask;
+
+ info = tapfds[idx];
+ if ((idx < 0) || (idx > MAX_TAP_DEV) || !info)
+ return;
+
+ if (info->pid > 0) {
+ ptask = find_task_by_pid(info->pid);
+ if (ptask)
+ info->status = CLEANSHUTDOWN;
+ }
+ info->blkif = NULL;
+
+ return;
+}
+
+static int blktap_open(struct inode *inode, struct file *filp)
+{
+ blkif_sring_t *sring;
+ int idx = iminor(inode) - BLKTAP_MINOR;
+ tap_blkif_t *info;
+ int i;
+
+ /* ctrl device, treat differently */
+ if (!idx)
+ return 0;
+
+ info = tapfds[idx];
+
+ if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) {
+ WPRINTK("Unable to open device /dev/xen/blktap%d\n",
+ idx);
+ return -ENODEV;
+ }
+
+ DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
+
+ /*Only one process can access device at a time*/
+ if (test_and_set_bit(0, &info->dev_inuse))
+ return -EBUSY;
+
+ info->dev_pending = 0;
+
+ /* Allocate the fe ring. */
+ sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
+ if (sring == NULL)
+ goto fail_nomem;
+
+ SetPageReserved(virt_to_page(sring));
+
+ SHARED_RING_INIT(sring);
+ FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
+
+ filp->private_data = info;
+ info->vma = NULL;
+
+ info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS,
+ GFP_KERNEL);
+
+ if (info->idx_map == NULL)
+ goto fail_nomem;
+
+ if (idx > 0) {
+ init_waitqueue_head(&info->wait);
+ for (i = 0; i < MAX_PENDING_REQS; i++)
+ info->idx_map[i] = INVALID_REQ;
+ }
+
+ DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
+ return 0;
+
+ fail_nomem:
+ return -ENOMEM;
+}
+
+static int blktap_release(struct inode *inode, struct file *filp)
+{
+ tap_blkif_t *info = filp->private_data;
+
+ /* check for control device */
+ if (!info)
+ return 0;
+
+ info->dev_inuse = 0;
+ DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
+
+ /* Free the ring page. */
+ ClearPageReserved(virt_to_page(info->ufe_ring.sring));
+ free_page((unsigned long) info->ufe_ring.sring);
+
+ /* Clear any active mappings and free foreign map table */
+ if (info->vma) {
+ zap_page_range(
+ info->vma, info->vma->vm_start,
+ info->vma->vm_end - info->vma->vm_start, NULL);
+
+ kfree(info->vma->vm_private_data);
+
+ info->vma = NULL;
+ }
+
+ if (info->idx_map) {
+ kfree(info->idx_map);
+ info->idx_map = NULL;
+ }
+
+ if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
+ if (info->blkif->xenblkd != NULL) {
+ kthread_stop(info->blkif->xenblkd);
+ info->blkif->xenblkd = NULL;
+ }
+ info->status = CLEANSHUTDOWN;
+ }
+
+ return 0;
+}
+
+
+/* Note on mmap:
+ * We need to map pages to user space in a way that will allow the block
+ * subsystem set up direct IO to them. This couldn't be done before, because
+ * there isn't really a sane way to translate a user virtual address down to a
+ * physical address when the page belongs to another domain.
+ *
+ * My first approach was to map the page in to kernel memory, add an entry
+ * for it in the physical frame list (using alloc_lomem_region as in blkback)
+ * and then attempt to map that page up to user space. This is disallowed
+ * by xen though, which realizes that we don't really own the machine frame
+ * underlying the physical page.
+ *
+ * The new approach is to provide explicit support for this in xen linux.
+ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
+ * mapped from other vms. vma->vm_private_data is set up as a mapping
+ * from pages to actual page structs. There is a new clause in get_user_pages
+ * that does the right thing for this sort of mapping.
+ */
+static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ int size;
+ struct page **map;
+ int i;
+ tap_blkif_t *info = filp->private_data;
+ int ret;
+
+ if (info == NULL) {
+ WPRINTK("blktap: mmap, retrieving idx failed\n");
+ return -ENOMEM;
+ }
+
+ vma->vm_flags |= VM_RESERVED;
+ vma->vm_ops = &blktap_vm_ops;
+
+ size = vma->vm_end - vma->vm_start;
+ if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
+ WPRINTK("you _must_ map exactly %d pages!\n",
+ mmap_pages + RING_PAGES);
+ return -EAGAIN;
+ }
+
+ size >>= PAGE_SHIFT;
+ info->rings_vstart = vma->vm_start;
+ info->user_vstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
+
+ /* Map the ring pages to the start of the region and reserve it. */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ ret = vm_insert_page(vma, vma->vm_start,
+ virt_to_page(info->ufe_ring.sring));
+ else
+ ret = remap_pfn_range(vma, vma->vm_start,
+ __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
+ PAGE_SIZE, vma->vm_page_prot);
+ if (ret) {
+ WPRINTK("Mapping user ring failed!\n");
+ goto fail;
+ }
+
+ /* Mark this VM as containing foreign pages, and set up mappings. */
+ map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
+ * sizeof(struct page *),
+ GFP_KERNEL);
+ if (map == NULL) {
+ WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
+ goto fail;
+ }
+
+ for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
+ map[i] = NULL;
+
+ vma->vm_private_data = map;
+ vma->vm_flags |= VM_FOREIGN;
+ vma->vm_flags |= VM_DONTCOPY;
+
+#ifdef CONFIG_X86
+ vma->vm_mm->context.has_foreign_mappings = 1;
+#endif
+
+ info->vma = vma;
+ info->ring_ok = 1;
+ return 0;
+ fail:
+ /* Clear any active mappings. */
+ zap_page_range(vma, vma->vm_start,
+ vma->vm_end - vma->vm_start, NULL);
+
+ return -ENOMEM;
+}
+
+
+static int blktap_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ tap_blkif_t *info = filp->private_data;
+
+ switch(cmd) {
+ case BLKTAP_IOCTL_KICK_FE:
+ {
+ /* There are fe messages to process. */
+ return blktap_read_ufe_ring(info);
+ }
+ case BLKTAP_IOCTL_SETMODE:
+ {
+ if (info) {
+ if (BLKTAP_MODE_VALID(arg)) {
+ info->mode = arg;
+ /* XXX: may need to flush rings here. */
+ DPRINTK("blktap: set mode to %lx\n",
+ arg);
+ return 0;
+ }
+ }
+ return 0;
+ }
+ case BLKTAP_IOCTL_PRINT_IDXS:
+ {
+ if (info) {
+ printk("User Rings: \n-----------\n");
+ printk("UF: rsp_cons: %2d, req_prod_prv: %2d "
+ "| req_prod: %2d, rsp_prod: %2d\n",
+ info->ufe_ring.rsp_cons,
+ info->ufe_ring.req_prod_pvt,
+ info->ufe_ring.sring->req_prod,
+ info->ufe_ring.sring->rsp_prod);
+ }
+ return 0;
+ }
+ case BLKTAP_IOCTL_SENDPID:
+ {
+ if (info) {
+ info->pid = (pid_t)arg;
+ DPRINTK("blktap: pid received %d\n",
+ info->pid);
+ }
+ return 0;
+ }
+ case BLKTAP_IOCTL_NEWINTF:
+ {
+ uint64_t val = (uint64_t)arg;
+ domid_translate_t *tr = (domid_translate_t *)&val;
+
+ DPRINTK("NEWINTF Req for domid %d and bus id %d\n",
+ tr->domid, tr->busid);
+ info = get_next_free_dev();
+ if (!info) {
+ WPRINTK("Error initialising /dev/xen/blktap - "
+ "No more devices\n");
+ return -1;
+ }
+ info->trans.domid = tr->domid;
+ info->trans.busid = tr->busid;
+ return info->minor;
+ }
+ case BLKTAP_IOCTL_FREEINTF:
+ {
+ unsigned long dev = arg;
+ unsigned long flags;
+
+ info = tapfds[dev];
+
+ if ((dev > MAX_TAP_DEV) || !info)
+ return 0; /* should this be an error? */
+
+ spin_lock_irqsave(&pending_free_lock, flags);
+ if (info->dev_pending)
+ info->dev_pending = 0;
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+
+ return 0;
+ }
+ case BLKTAP_IOCTL_MINOR:
+ {
+ unsigned long dev = arg;
+
+ info = tapfds[dev];
+
+ if ((dev > MAX_TAP_DEV) || !info)
+ return -EINVAL;
+
+ return info->minor;
+ }
+ case BLKTAP_IOCTL_MAJOR:
+ return blktap_major;
+
+ case BLKTAP_QUERY_ALLOC_REQS:
+ {
+ WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
+ alloc_pending_reqs, blkif_reqs);
+ return (alloc_pending_reqs/blkif_reqs) * 100;
+ }
+ }
+ return -ENOIOCTLCMD;
+}
+
+static unsigned int blktap_poll(struct file *filp, poll_table *wait)
+{
+ tap_blkif_t *info = filp->private_data;
+
+ /* do not work on the control device */
+ if (!info)
+ return 0;
+
+ poll_wait(filp, &info->wait, wait);
+ if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
+ RING_PUSH_REQUESTS(&info->ufe_ring);
+ return POLLIN | POLLRDNORM;
+ }
+ return 0;
+}
+
+void blktap_kick_user(int idx)
+{
+ tap_blkif_t *info;
+
+ info = tapfds[idx];
+
+ if ((idx < 0) || (idx > MAX_TAP_DEV) || !info)
+ return;
+
+ wake_up_interruptible(&info->wait);
+
+ return;
+}
+
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+ blkif_request_t *req,
+ pending_req_t *pending_req);
+static void make_response(blkif_t *blkif, u64 id,
+ unsigned short op, int st);
+
+/******************************************************************
+ * misc small helpers
+ */
+static int req_increase(void)
+{
+ int i, j;
+
+ if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock)
+ return -EINVAL;
+
+ pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t)
+ * blkif_reqs, GFP_KERNEL);
+ foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);
+
+ if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
+ goto out_of_memory;
+
+ DPRINTK("%s: reqs=%d, pages=%d\n",
+ __FUNCTION__, blkif_reqs, mmap_pages);
+
+ for (i = 0; i < MAX_PENDING_REQS; i++) {
+ list_add_tail(&pending_reqs[mmap_alloc][i].free_list,
+ &pending_free);
+ pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
+ for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
+ BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc,
+ i, j));
+ }
+
+ mmap_alloc++;
+ DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
+ return 0;
+
+ out_of_memory:
+ free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
+ kfree(pending_reqs[mmap_alloc]);
+ WPRINTK("%s: out of memory\n", __FUNCTION__);
+ return -ENOMEM;
+}
+
+static void mmap_req_del(int mmap)
+{
+ BUG_ON(!spin_is_locked(&pending_free_lock));
+
+ kfree(pending_reqs[mmap]);
+ pending_reqs[mmap] = NULL;
+
+ free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
+ foreign_pages[mmap] = NULL;
+
+ mmap_lock = 0;
+ DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
+ mmap_alloc--;
+}
+
+static pending_req_t* alloc_req(void)
+{
+ pending_req_t *req = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pending_free_lock, flags);
+
+ if (!list_empty(&pending_free)) {
+ req = list_entry(pending_free.next, pending_req_t, free_list);
+ list_del(&req->free_list);
+ }
+
+ if (req) {
+ req->inuse = 1;
+ alloc_pending_reqs++;
+ }
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+
+ return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+ unsigned long flags;
+ int was_empty;
+
+ spin_lock_irqsave(&pending_free_lock, flags);
+
+ alloc_pending_reqs--;
+ req->inuse = 0;
+ if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
+ mmap_inuse--;
+ if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+ return;
+ }
+ was_empty = list_empty(&pending_free);
+ list_add(&req->free_list, &pending_free);
+
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+
+ if (was_empty)
+ wake_up(&pending_free_wq);
+}
+
+static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
+ int tapidx)
+{
+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+ unsigned int i, invcount = 0;
+ struct grant_handle_pair *khandle;
+ uint64_t ptep;
+ int ret, mmap_idx;
+ unsigned long kvaddr, uvaddr;
+ tap_blkif_t *info;
+
+
+ info = tapfds[tapidx];
+
+ if ((tapidx < 0) || (tapidx > MAX_TAP_DEV) || !info) {
+ WPRINTK("fast_flush: Couldn't get info!\n");
+ return;
+ }
+
+ if (info->vma != NULL &&
+ xen_feature(XENFEAT_auto_translated_physmap)) {
+ down_write(&info->vma->vm_mm->mmap_sem);
+ zap_page_range(info->vma,
+ MMAP_VADDR(info->user_vstart, u_idx, 0),
+ req->nr_pages << PAGE_SHIFT, NULL);
+ up_write(&info->vma->vm_mm->mmap_sem);
+ return;
+ }
+
+ mmap_idx = req->mem_idx;
+
+ for (i = 0; i < req->nr_pages; i++) {
+ kvaddr = idx_to_kaddr(mmap_idx, k_idx, i);
+ uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
+
+ khandle = &pending_handle(mmap_idx, k_idx, i);
+
+ if (khandle->kernel != INVALID_GRANT_HANDLE) {
+ gnttab_set_unmap_op(&unmap[invcount],
+ idx_to_kaddr(mmap_idx, k_idx, i),
+ GNTMAP_host_map, khandle->kernel);
+ invcount++;
+
+ set_phys_to_machine(
+ __pa(idx_to_kaddr(mmap_idx, k_idx, i))
+ >> PAGE_SHIFT, INVALID_P2M_ENTRY);
+ }
+
+ if (khandle->user != INVALID_GRANT_HANDLE) {
+ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
+ if (create_lookup_pte_addr(
+ info->vma->vm_mm,
+ MMAP_VADDR(info->user_vstart, u_idx, i),
+ &ptep) !=0) {
+ WPRINTK("Couldn't get a pte addr!\n");
+ return;
+ }
+
+ gnttab_set_unmap_op(&unmap[invcount], ptep,
+ GNTMAP_host_map
+ | GNTMAP_application_map
+ | GNTMAP_contains_pte,
+ khandle->user);
+ invcount++;
+ }
+
+ BLKTAP_INVALIDATE_HANDLE(khandle);
+ }
+ ret = HYPERVISOR_grant_table_op(
+ GNTTABOP_unmap_grant_ref, unmap, invcount);
+ BUG_ON(ret);
+
+ if (info->vma != NULL && !xen_feature(XENFEAT_auto_translated_physmap))
+ zap_page_range(info->vma,
+ MMAP_VADDR(info->user_vstart, u_idx, 0),
+ req->nr_pages << PAGE_SHIFT, NULL);
+}
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+static void print_stats(blkif_t *blkif)
+{
+ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n",
+ current->comm, blkif->st_oo_req,
+ blkif->st_rd_req, blkif->st_wr_req);
+ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+ blkif->st_rd_req = 0;
+ blkif->st_wr_req = 0;
+ blkif->st_oo_req = 0;
+}
+
+int tap_blkif_schedule(void *arg)
+{
+ blkif_t *blkif = arg;
+
+ blkif_get(blkif);
+
+ if (debug_lvl)
+ printk(KERN_DEBUG "%s: started\n", current->comm);
+
+ while (!kthread_should_stop()) {
+ if (try_to_freeze())
+ continue;
+
+ wait_event_interruptible(
+ blkif->wq,
+ blkif->waiting_reqs || kthread_should_stop());
+ wait_event_interruptible(
+ pending_free_wq,
+ !list_empty(&pending_free) || kthread_should_stop());
+
+ blkif->waiting_reqs = 0;
+ smp_mb(); /* clear flag *before* checking for work */
+
+ if (do_block_io_op(blkif))
+ blkif->waiting_reqs = 1;
+
+ if (log_stats && time_after(jiffies, blkif->st_print))
+ print_stats(blkif);
+ }
+
+ if (log_stats)
+ print_stats(blkif);
+ if (debug_lvl)
+ printk(KERN_DEBUG "%s: exiting\n", current->comm);
+
+ blkif->xenblkd = NULL;
+ blkif_put(blkif);
+
+ return 0;
+}
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called by user level ioctl()
+ */
+
+static int blktap_read_ufe_ring(tap_blkif_t *info)
+{
+ /* This is called to read responses from the UFE ring. */
+ RING_IDX i, j, rp;
+ blkif_response_t *resp;
+ blkif_t *blkif=NULL;
+ int pending_idx, usr_idx, mmap_idx;
+ pending_req_t *pending_req;
+
+ if (!info)
+ return 0;
+
+ /* We currently only forward packets in INTERCEPT_FE mode. */
+ if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
+ return 0;
+
+ /* for each outstanding message on the UFEring */
+ rp = info->ufe_ring.sring->rsp_prod;
+ rmb();
+
+ for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
+ blkif_response_t res;
+ resp = RING_GET_RESPONSE(&info->ufe_ring, i);
+ memcpy(&res, resp, sizeof(res));
+ mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
+ ++info->ufe_ring.rsp_cons;
+
+ /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
+ usr_idx = (int)res.id;
+ pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
+ mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
+
+ if ( (mmap_idx >= mmap_alloc) ||
+ (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) )
+ WPRINTK("Incorrect req map"
+ "[%d], internal map [%d,%d (%d)]\n",
+ usr_idx, mmap_idx,
+ ID_TO_IDX(info->idx_map[usr_idx]),
+ MASK_PEND_IDX(
+ ID_TO_IDX(info->idx_map[usr_idx])));
+
+ pending_req = &pending_reqs[mmap_idx][pending_idx];
+ blkif = pending_req->blkif;
+
+ for (j = 0; j < pending_req->nr_pages; j++) {
+
+ unsigned long kvaddr, uvaddr;
+ struct page **map = info->vma->vm_private_data;
+ struct page *pg;
+ int offset;
+
+ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
+ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j);
+
+ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+ ClearPageReserved(pg);
+ offset = (uvaddr - info->vma->vm_start)
+ >> PAGE_SHIFT;
+ map[offset] = NULL;
+ }
+ fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
+ info->idx_map[usr_idx] = INVALID_REQ;
+ make_response(blkif, pending_req->id, res.operation,
+ res.status);
+ blkif_put(pending_req->blkif);
+ free_req(pending_req);
+ }
+
+ return 0;
+}
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+static void blkif_notify_work(blkif_t *blkif)
+{
+ blkif->waiting_reqs = 1;
+ wake_up(&blkif->wq);
+}
+
+irqreturn_t tap_blkif_be_int(int irq, void *dev_id)
+{
+ blkif_notify_work(dev_id);
+ return IRQ_HANDLED;
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+static int print_dbug = 1;
+static int do_block_io_op(blkif_t *blkif)
+{
+ blkif_back_rings_t *blk_rings = &blkif->blk_rings;
+ blkif_request_t req;
+ pending_req_t *pending_req;
+ RING_IDX rc, rp;
+ int more_to_do = 0;
+ tap_blkif_t *info;
+
+ rc = blk_rings->common.req_cons;
+ rp = blk_rings->common.sring->req_prod;
+ rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+ /*Check blkif has corresponding UE ring*/
+ if (blkif->dev_num < 0) {
+ /*oops*/
+ if (print_dbug) {
+ WPRINTK("Corresponding UE "
+ "ring does not exist!\n");
+ print_dbug = 0; /*We only print this message once*/
+ }
+ return 0;
+ }
+
+ info = tapfds[blkif->dev_num];
+
+ if (blkif->dev_num > MAX_TAP_DEV || !info || !info->dev_inuse) {
+ if (print_dbug) {
+ WPRINTK("Can't get UE info!\n");
+ print_dbug = 0;
+ }
+ return 0;
+ }
+
+ while (rc != rp) {
+
+ if (RING_FULL(&info->ufe_ring)) {
+ WPRINTK("RING_FULL! More to do\n");
+ more_to_do = 1;
+ break;
+ }
+
+ if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) {
+ WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
+ " More to do\n");
+ more_to_do = 1;
+ break;
+ }
+
+ pending_req = alloc_req();
+ if (NULL == pending_req) {
+ blkif->st_oo_req++;
+ more_to_do = 1;
+ break;
+ }
+
+ if (kthread_should_stop()) {
+ more_to_do = 1;
+ break;
+ }
+
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc),
+ sizeof(req));
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
+ break;
+ case BLKIF_PROTOCOL_X86_64:
+ blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
+ break;
+ default:
+ BUG();
+ }
+ blk_rings->common.req_cons = ++rc; /* before make_response() */
+
+ switch (req.operation) {
+ case BLKIF_OP_READ:
+ blkif->st_rd_req++;
+ dispatch_rw_block_io(blkif, &req, pending_req);
+ break;
+
+ case BLKIF_OP_WRITE:
+ blkif->st_wr_req++;
+ dispatch_rw_block_io(blkif, &req, pending_req);
+ break;
+
+ default:
+ /* A good sign something is wrong: sleep for a while to
+ * avoid excessive CPU consumption by a bad guest. */
+ msleep(1);
+ WPRINTK("unknown operation [%d]\n",
+ req.operation);
+ make_response(blkif, req.id, req.operation,
+ BLKIF_RSP_ERROR);
+ free_req(pending_req);
+ break;
+ }
+
+ /* Yield point for this unbounded loop. */
+ cond_resched();
+ }
+
+ blktap_kick_user(blkif->dev_num);
+
+ return more_to_do;
+}
+
+static void dispatch_rw_block_io(blkif_t *blkif,
+ blkif_request_t *req,
+ pending_req_t *pending_req)
+{
+ extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
+ int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
+ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+ unsigned int nseg;
+ int ret, i, nr_sects = 0;
+ tap_blkif_t *info;
+ blkif_request_t *target;
+ int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
+ int usr_idx;
+ uint16_t mmap_idx = pending_req->mem_idx;
+
+ if (blkif->dev_num < 0 || blkif->dev_num > MAX_TAP_DEV)
+ goto fail_response;
+
+ info = tapfds[blkif->dev_num];
+ if (info == NULL)
+ goto fail_response;
+
+ /* Check we have space on user ring - should never fail. */
+ usr_idx = GET_NEXT_REQ(info->idx_map);
+ if (usr_idx == INVALID_REQ) {
+ BUG();
+ goto fail_response;
+ }
+
+ /* Check that number of segments is sane. */
+ nseg = req->nr_segments;
+ if ( unlikely(nseg == 0) ||
+ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
+ WPRINTK("Bad number of segments in request (%d)\n", nseg);
+ goto fail_response;
+ }
+
+ /* Make sure userspace is ready. */
+ if (!info->ring_ok) {
+ WPRINTK("blktap: ring not ready for requests!\n");
+ goto fail_response;
+ }
+
+ if (RING_FULL(&info->ufe_ring)) {
+ WPRINTK("blktap: fe_ring is full, can't add "
+ "IO Request will be dropped. %d %d\n",
+ RING_SIZE(&info->ufe_ring),
+ RING_SIZE(&blkif->blk_rings.common));
+ goto fail_response;
+ }
+
+ pending_req->blkif = blkif;
+ pending_req->id = req->id;
+ pending_req->operation = operation;
+ pending_req->status = BLKIF_RSP_OKAY;
+ pending_req->nr_pages = nseg;
+ op = 0;
+ for (i = 0; i < nseg; i++) {
+ unsigned long uvaddr;
+ unsigned long kvaddr;
+ uint64_t ptep;
+ uint32_t flags;
+
+ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
+ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
+
+ flags = GNTMAP_host_map;
+ if (operation == WRITE)
+ flags |= GNTMAP_readonly;
+ gnttab_set_map_op(&map[op], kvaddr, flags,
+ req->seg[i].gref, blkif->domid);
+ op++;
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* Now map it to user. */
+ ret = create_lookup_pte_addr(info->vma->vm_mm,
+ uvaddr, &ptep);
+ if (ret) {
+ WPRINTK("Couldn't get a pte addr!\n");
+ goto fail_flush;
+ }
+
+ flags = GNTMAP_host_map | GNTMAP_application_map
+ | GNTMAP_contains_pte;
+ if (operation == WRITE)
+ flags |= GNTMAP_readonly;
+ gnttab_set_map_op(&map[op], ptep, flags,
+ req->seg[i].gref, blkif->domid);
+ op++;
+ }
+
+ nr_sects += (req->seg[i].last_sect -
+ req->seg[i].first_sect + 1);
+ }
+
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
+ BUG_ON(ret);
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ for (i = 0; i < (nseg*2); i+=2) {
+ unsigned long uvaddr;
+ unsigned long kvaddr;
+ unsigned long offset;
+ struct page *pg;
+
+ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
+ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2);
+
+ if (unlikely(map[i].status != 0)) {
+ WPRINTK("invalid kernel buffer -- "
+ "could not remap it\n");
+ ret |= 1;
+ map[i].handle = INVALID_GRANT_HANDLE;
+ }
+
+ if (unlikely(map[i+1].status != 0)) {
+ WPRINTK("invalid user buffer -- "
+ "could not remap it\n");
+ ret |= 1;
+ map[i+1].handle = INVALID_GRANT_HANDLE;
+ }
+
+ pending_handle(mmap_idx, pending_idx, i/2).kernel
+ = map[i].handle;
+ pending_handle(mmap_idx, pending_idx, i/2).user
+ = map[i+1].handle;
+
+ if (ret)
+ continue;
+
+ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
+ FOREIGN_FRAME(map[i].dev_bus_addr
+ >> PAGE_SHIFT));
+ offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
+ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+ ((struct page **)info->vma->vm_private_data)[offset] =
+ pg;
+ }
+ } else {
+ for (i = 0; i < nseg; i++) {
+ unsigned long uvaddr;
+ unsigned long kvaddr;
+ unsigned long offset;
+ struct page *pg;
+
+ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
+ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
+
+ if (unlikely(map[i].status != 0)) {
+ WPRINTK("invalid kernel buffer -- "
+ "could not remap it\n");
+ ret |= 1;
+ map[i].handle = INVALID_GRANT_HANDLE;
+ }
+
+ pending_handle(mmap_idx, pending_idx, i).kernel
+ = map[i].handle;
+
+ if (ret)
+ continue;
+
+ offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
+ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+ ((struct page **)info->vma->vm_private_data)[offset] =
+ pg;
+ }
+ }
+
+ if (ret)
+ goto fail_flush;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ down_write(&info->vma->vm_mm->mmap_sem);
+ /* Mark mapped pages as reserved: */
+ for (i = 0; i < req->nr_segments; i++) {
+ unsigned long kvaddr;
+ struct page *pg;
+
+ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
+ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+ SetPageReserved(pg);
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+ ret = vm_insert_page(info->vma,
+ MMAP_VADDR(info->user_vstart,
+ usr_idx, i), pg);
+ if (ret) {
+ up_write(&info->vma->vm_mm->mmap_sem);
+ goto fail_flush;
+ }
+ }
+ }
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ up_write(&info->vma->vm_mm->mmap_sem);
+
+ /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
+ info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
+
+ blkif_get(blkif);
+ /* Finally, write the request message to the user ring. */
+ target = RING_GET_REQUEST(&info->ufe_ring,
+ info->ufe_ring.req_prod_pvt);
+ memcpy(target, req, sizeof(*req));
+ target->id = usr_idx;
+ wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
+ info->ufe_ring.req_prod_pvt++;
+
+ if (operation == READ)
+ blkif->st_rd_sect += nr_sects;
+ else if (operation == WRITE)
+ blkif->st_wr_sect += nr_sects;
+
+ return;
+
+ fail_flush:
+ WPRINTK("Reached Fail_flush\n");
+ fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
+ fail_response:
+ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+ free_req(pending_req);
+ msleep(1); /* back off a bit */
+}
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+static void make_response(blkif_t *blkif, u64 id,
+ unsigned short op, int st)
+{
+ blkif_response_t resp;
+ unsigned long flags;
+ blkif_back_rings_t *blk_rings = &blkif->blk_rings;
+ int more_to_do = 0;
+ int notify;
+
+ resp.id = id;
+ resp.operation = op;
+ resp.status = st;
+
+ spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+ /* Place on the response ring for the relevant domain. */
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ memcpy(RING_GET_RESPONSE(&blk_rings->native,
+ blk_rings->native.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ memcpy(RING_GET_RESPONSE(&blk_rings->x86_32,
+ blk_rings->x86_32.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ case BLKIF_PROTOCOL_X86_64:
+ memcpy(RING_GET_RESPONSE(&blk_rings->x86_64,
+ blk_rings->x86_64.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ default:
+ BUG();
+ }
+ blk_rings->common.rsp_prod_pvt++;
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
+
+ if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
+ /*
+ * Tail check for pending requests. Allows frontend to avoid
+ * notifications if requests are already in flight (lower
+ * overheads and promotes batching).
+ */
+ RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
+ } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
+ more_to_do = 1;
+ }
+
+ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+ if (more_to_do)
+ blkif_notify_work(blkif);
+ if (notify)
+ notify_remote_via_irq(blkif->irq);
+}
+
+static int __init blkif_init(void)
+{
+ int i, ret;
+ struct class *class;
+
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ INIT_LIST_HEAD(&pending_free);
+ for(i = 0; i < 2; i++) {
+ ret = req_increase();
+ if (ret)
+ break;
+ }
+ if (i == 0)
+ return ret;
+
+ tap_blkif_interface_init();
+
+ alloc_pending_reqs = 0;
+
+ tap_blkif_xenbus_init();
+
+ /* Dynamically allocate a major for this device */
+ ret = register_chrdev(0, "blktap", &blktap_fops);
+
+ if (ret < 0) {
+ WPRINTK("Couldn't register /dev/xen/blktap\n");
+ return -ENOMEM;
+ }
+
+ blktap_major = ret;
+
+ /* tapfds[0] is always NULL */
+ blktap_next_minor++;
+
+ DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);
+
+ /* Make sure the xen class exists */
+ if ((class = get_xen_class()) != NULL) {
+ /*
+ * This will allow udev to create the blktap ctrl device.
+ * We only want to create blktap0 first. We don't want
+ * to flood the sysfs system with needless blktap devices.
+ * We only create the device when a request of a new device is
+ * made.
+ */
+ device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
+ } else {
+ /* this is bad, but not fatal */
+ WPRINTK("blktap: sysfs xen_class not created\n");
+ }
+
+ DPRINTK("Blktap device successfully created\n");
+
+ return 0;
+}
+
+module_init(blkif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null
+#include "blktap.c"
--- /dev/null
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __BLKIF__BACKEND__COMMON_H__
+#define __BLKIF__BACKEND__COMMON_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <asm/setup.h>
+#include <asm/pgalloc.h>
+#include <xen/evtchn.h>
+#include <asm/hypervisor.h>
+#include <xen/blkif.h>
+#include <xen/gnttab.h>
+#include <xen/driver_util.h>
+
+#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
+ __FILE__ , __LINE__ , ## _a )
+
+#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
+
+struct backend_info;
+
+typedef struct blkif_st {
+ /* Unique identifier for this interface. */
+ domid_t domid;
+ unsigned int handle;
+ /* Physical parameters of the comms window. */
+ unsigned int irq;
+ /* Comms information. */
+ enum blkif_protocol blk_protocol;
+ blkif_back_rings_t blk_rings;
+ struct vm_struct *blk_ring_area;
+ /* Back pointer to the backend_info. */
+ struct backend_info *be;
+ /* Private fields. */
+ spinlock_t blk_ring_lock;
+ atomic_t refcnt;
+
+ wait_queue_head_t wq;
+ struct task_struct *xenblkd;
+ unsigned int waiting_reqs;
+ struct request_queue *plug;
+
+ /* statistics */
+ unsigned long st_print;
+ int st_rd_req;
+ int st_wr_req;
+ int st_oo_req;
+ int st_rd_sect;
+ int st_wr_sect;
+
+ wait_queue_head_t waiting_to_free;
+
+ grant_handle_t shmem_handle;
+ grant_ref_t shmem_ref;
+
+ int dev_num;
+ uint64_t sectors;
+} blkif_t;
+
+blkif_t *tap_alloc_blkif(domid_t domid);
+void tap_blkif_free(blkif_t *blkif);
+int tap_blkif_map(blkif_t *blkif, unsigned long shared_page,
+ unsigned int evtchn);
+void tap_blkif_unmap(blkif_t *blkif);
+
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b) \
+ do { \
+ if (atomic_dec_and_test(&(_b)->refcnt)) \
+ wake_up(&(_b)->waiting_to_free);\
+ } while (0)
+
+
+struct phys_req {
+ unsigned short dev;
+ unsigned short nr_sects;
+ struct block_device *bdev;
+ blkif_sector_t sector_number;
+};
+
+void tap_blkif_interface_init(void);
+
+void tap_blkif_xenbus_init(void);
+
+irqreturn_t tap_blkif_be_int(int irq, void *dev_id);
+int tap_blkif_schedule(void *arg);
+
+int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif);
+void signal_tapdisk(int idx);
+
+#endif /* __BLKIF__BACKEND__COMMON_H__ */
--- /dev/null
+/******************************************************************************
+ * drivers/xen/blktap/interface.c
+ *
+ * Block-device interface management.
+ *
+ * Copyright (c) 2004, Keir Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+
+ */
+
+#include "common.h"
+#include <xen/evtchn.h>
+
+static struct kmem_cache *blkif_cachep;
+
+blkif_t *tap_alloc_blkif(domid_t domid)
+{
+ blkif_t *blkif;
+
+ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
+ if (!blkif)
+ return ERR_PTR(-ENOMEM);
+
+ memset(blkif, 0, sizeof(*blkif));
+ blkif->domid = domid;
+ spin_lock_init(&blkif->blk_ring_lock);
+ atomic_set(&blkif->refcnt, 1);
+ init_waitqueue_head(&blkif->wq);
+ blkif->st_print = jiffies;
+ init_waitqueue_head(&blkif->waiting_to_free);
+
+ return blkif;
+}
+
+static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
+{
+ struct gnttab_map_grant_ref op;
+
+ gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+ GNTMAP_host_map, shared_page, blkif->domid);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status) {
+ DPRINTK(" Grant table operation failure !\n");
+ return op.status;
+ }
+
+ blkif->shmem_ref = shared_page;
+ blkif->shmem_handle = op.handle;
+
+ return 0;
+}
+
+static void unmap_frontend_page(blkif_t *blkif)
+{
+ struct gnttab_unmap_grant_ref op;
+
+ gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+ GNTMAP_host_map, blkif->shmem_handle);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+}
+
+int tap_blkif_map(blkif_t *blkif, unsigned long shared_page,
+ unsigned int evtchn)
+{
+ int err;
+
+ /* Already connected through? */
+ if (blkif->irq)
+ return 0;
+
+ if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
+ return -ENOMEM;
+
+ err = map_frontend_page(blkif, shared_page);
+ if (err) {
+ free_vm_area(blkif->blk_ring_area);
+ return err;
+ }
+
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ {
+ blkif_sring_t *sring;
+ sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
+ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_32:
+ {
+ blkif_x86_32_sring_t *sring_x86_32;
+ sring_x86_32 = (blkif_x86_32_sring_t *)blkif->blk_ring_area->addr;
+ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_64:
+ {
+ blkif_x86_64_sring_t *sring_x86_64;
+ sring_x86_64 = (blkif_x86_64_sring_t *)blkif->blk_ring_area->addr;
+ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+ break;
+ }
+ default:
+ BUG();
+ }
+
+ err = bind_interdomain_evtchn_to_irqhandler(
+ blkif->domid, evtchn, tap_blkif_be_int,
+ 0, "blkif-backend", blkif);
+ if (err < 0) {
+ unmap_frontend_page(blkif);
+ free_vm_area(blkif->blk_ring_area);
+ blkif->blk_rings.common.sring = NULL;
+ return err;
+ }
+ blkif->irq = err;
+
+ return 0;
+}
+
+void tap_blkif_unmap(blkif_t *blkif)
+{
+ if (blkif->irq) {
+ unbind_from_irqhandler(blkif->irq, blkif);
+ blkif->irq = 0;
+ }
+ if (blkif->blk_rings.common.sring) {
+ unmap_frontend_page(blkif);
+ free_vm_area(blkif->blk_ring_area);
+ blkif->blk_rings.common.sring = NULL;
+ }
+}
+
+void tap_blkif_free(blkif_t *blkif)
+{
+ atomic_dec(&blkif->refcnt);
+ wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
+
+ tap_blkif_unmap(blkif);
+ kmem_cache_free(blkif_cachep, blkif);
+}
+
+void __init tap_blkif_interface_init(void)
+{
+ blkif_cachep = kmem_cache_create("blktapif_cache", sizeof(blkif_t),
+ 0, 0, NULL);
+}
--- /dev/null
+/* drivers/xen/blktap/xenbus.c
+ *
+ * Xenbus code for blktap
+ *
+ * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
+ *
+ * Based on the blkback xenbus code:
+ *
+ * Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+ * Copyright (C) 2005 XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <xen/xenbus.h>
+#include "common.h"
+#include "../core/domctl.h"
+
+
+struct backend_info
+{
+ struct xenbus_device *dev;
+ blkif_t *blkif;
+ struct xenbus_watch backend_watch;
+ int xenbus_id;
+ int group_added;
+};
+
+
+static void connect(struct backend_info *);
+static int connect_ring(struct backend_info *);
+static int blktap_remove(struct xenbus_device *dev);
+static int blktap_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id);
+static void tap_backend_changed(struct xenbus_watch *, const char **,
+ unsigned int);
+static void tap_frontend_changed(struct xenbus_device *dev,
+ enum xenbus_state frontend_state);
+
+static int strsep_len(const char *str, char c, unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; str[i]; i++)
+ if (str[i] == c) {
+ if (len == 0)
+ return i;
+ len--;
+ }
+ return (len == 0) ? i : -ERANGE;
+}
+
+static long get_id(const char *str)
+{
+ int len,end;
+ const char *ptr;
+ char *tptr, num[10];
+
+ len = strsep_len(str, '/', 2);
+ end = strlen(str);
+ if ( (len < 0) || (end < 0) ) return -1;
+
+ ptr = str + len + 1;
+ strncpy(num,ptr,end - len);
+ tptr = num + (end - (len + 1));
+ *tptr = '\0';
+ DPRINTK("Get_id called for %s (%s)\n",str,num);
+
+ return simple_strtol(num, NULL, 10);
+}
+
+static int blktap_name(blkif_t *blkif, char *buf)
+{
+ char *devpath, *devname;
+ struct xenbus_device *dev = blkif->be->dev;
+
+ devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
+ if (IS_ERR(devpath))
+ return PTR_ERR(devpath);
+
+ if ((devname = strstr(devpath, "/dev/")) != NULL)
+ devname += strlen("/dev/");
+ else
+ devname = devpath;
+
+ snprintf(buf, TASK_COMM_LEN, "blktap.%d.%s", blkif->domid, devname);
+ kfree(devpath);
+
+ return 0;
+}
+
+/****************************************************************
+ * sysfs interface for I/O requests of blktap device
+ */
+
+#define VBD_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct device *_dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+ { \
+ struct xenbus_device *dev = to_xenbus_device(_dev); \
+ struct backend_info *be = dev->dev.driver_data; \
+ \
+ return sprintf(buf, format, ##args); \
+ } \
+ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
+VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
+VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
+VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
+VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
+
+static struct attribute *tapstat_attrs[] = {
+ &dev_attr_oo_req.attr,
+ &dev_attr_rd_req.attr,
+ &dev_attr_wr_req.attr,
+ &dev_attr_rd_sect.attr,
+ &dev_attr_wr_sect.attr,
+ NULL
+};
+
+static struct attribute_group tapstat_group = {
+ .name = "statistics",
+ .attrs = tapstat_attrs,
+};
+
+int xentap_sysfs_addif(struct xenbus_device *dev)
+{
+ int err;
+ struct backend_info *be = dev->dev.driver_data;
+ err = sysfs_create_group(&dev->dev.kobj, &tapstat_group);
+ if (!err)
+ be->group_added = 1;
+ return err;
+}
+
+void xentap_sysfs_delif(struct xenbus_device *dev)
+{
+ struct backend_info *be = dev->dev.driver_data;
+ sysfs_remove_group(&dev->dev.kobj, &tapstat_group);
+ be->group_added = 0;
+}
+
+static int blktap_remove(struct xenbus_device *dev)
+{
+ struct backend_info *be = dev->dev.driver_data;
+
+ if (be->group_added)
+ xentap_sysfs_delif(be->dev);
+ if (be->backend_watch.node) {
+ unregister_xenbus_watch(&be->backend_watch);
+ kfree(be->backend_watch.node);
+ be->backend_watch.node = NULL;
+ }
+ if (be->blkif) {
+ if (be->blkif->xenblkd)
+ kthread_stop(be->blkif->xenblkd);
+ signal_tapdisk(be->blkif->dev_num);
+ tap_blkif_free(be->blkif);
+ be->blkif = NULL;
+ }
+ kfree(be);
+ dev->dev.driver_data = NULL;
+ return 0;
+}
+
+static void tap_update_blkif_status(blkif_t *blkif)
+{
+ int err;
+ char name[TASK_COMM_LEN];
+
+ /* Not ready to connect? */
+ if(!blkif->irq || !blkif->sectors) {
+ return;
+ }
+
+ /* Already connected? */
+ if (blkif->be->dev->state == XenbusStateConnected)
+ return;
+
+ /* Attempt to connect: exit if we fail to. */
+ connect(blkif->be);
+ if (blkif->be->dev->state != XenbusStateConnected)
+ return;
+
+ err = blktap_name(blkif, name);
+ if (err) {
+ xenbus_dev_error(blkif->be->dev, err, "get blktap dev name");
+ return;
+ }
+
+ if (!blkif->be->group_added) {
+ err = xentap_sysfs_addif(blkif->be->dev);
+ if (err) {
+ xenbus_dev_fatal(blkif->be->dev, err,
+ "creating sysfs entries");
+ return;
+ }
+ }
+
+ blkif->xenblkd = kthread_run(tap_blkif_schedule, blkif, name);
+ if (IS_ERR(blkif->xenblkd)) {
+ err = PTR_ERR(blkif->xenblkd);
+ blkif->xenblkd = NULL;
+ xenbus_dev_fatal(blkif->be->dev, err, "start xenblkd");
+ WPRINTK("Error starting thread\n");
+ }
+}
+
+/**
+ * Entry point to this code when a new device is created. Allocate
+ * the basic structures, and watch the store waiting for the
+ * user-space program to tell us the physical device info. Switch to
+ * InitWait.
+ */
+static int blktap_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err;
+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
+ GFP_KERNEL);
+ if (!be) {
+ xenbus_dev_fatal(dev, -ENOMEM,
+ "allocating backend structure");
+ return -ENOMEM;
+ }
+
+ be->dev = dev;
+ dev->dev.driver_data = be;
+ be->xenbus_id = get_id(dev->nodename);
+
+ be->blkif = tap_alloc_blkif(dev->otherend_id);
+ if (IS_ERR(be->blkif)) {
+ err = PTR_ERR(be->blkif);
+ be->blkif = NULL;
+ xenbus_dev_fatal(dev, err, "creating block interface");
+ goto fail;
+ }
+
+ /* setup back pointer */
+ be->blkif->be = be;
+ be->blkif->sectors = 0;
+
+ /* set a watch on disk info, waiting for userspace to update details*/
+ err = xenbus_watch_path2(dev, dev->nodename, "info",
+ &be->backend_watch, tap_backend_changed);
+ if (err)
+ goto fail;
+
+ err = xenbus_switch_state(dev, XenbusStateInitWait);
+ if (err)
+ goto fail;
+ return 0;
+
+fail:
+ DPRINTK("blktap probe failed\n");
+ blktap_remove(dev);
+ return err;
+}
+
+
+/**
+ * Callback received when the user space code has placed the device
+ * information in xenstore.
+ */
+static void tap_backend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ int err;
+ unsigned long info;
+ struct backend_info *be
+ = container_of(watch, struct backend_info, backend_watch);
+ struct xenbus_device *dev = be->dev;
+
+ /**
+ * Check to see whether userspace code has opened the image
+ * and written sector
+ * and disk info to xenstore
+ */
+ err = xenbus_gather(XBT_NIL, dev->nodename, "info", "%lu", &info,
+ NULL);
+ if (XENBUS_EXIST_ERR(err))
+ return;
+ if (err) {
+ xenbus_dev_error(dev, err, "getting info");
+ return;
+ }
+
+ DPRINTK("Userspace update on disk info, %lu\n",info);
+
+ err = xenbus_gather(XBT_NIL, dev->nodename, "sectors", "%llu",
+ &be->blkif->sectors, NULL);
+
+ /* Associate tap dev with domid*/
+ be->blkif->dev_num = dom_to_devid(be->blkif->domid, be->xenbus_id,
+ be->blkif);
+ DPRINTK("Thread started for domid [%d], connecting disk\n",
+ be->blkif->dev_num);
+
+ tap_update_blkif_status(be->blkif);
+}
+
+/**
+ * Callback received when the frontend's state changes.
+ */
+static void tap_frontend_changed(struct xenbus_device *dev,
+ enum xenbus_state frontend_state)
+{
+ struct backend_info *be = dev->dev.driver_data;
+ int err;
+
+ DPRINTK("\n");
+
+ switch (frontend_state) {
+ case XenbusStateInitialising:
+ if (dev->state == XenbusStateClosed) {
+ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
+ __FUNCTION__, dev->nodename);
+ xenbus_switch_state(dev, XenbusStateInitWait);
+ }
+ break;
+
+ case XenbusStateInitialised:
+ case XenbusStateConnected:
+ /* Ensure we connect even when two watches fire in
+ close successsion and we miss the intermediate value
+ of frontend_state. */
+ if (dev->state == XenbusStateConnected)
+ break;
+
+ err = connect_ring(be);
+ if (err)
+ break;
+ tap_update_blkif_status(be->blkif);
+ break;
+
+ case XenbusStateClosing:
+ if (be->blkif->xenblkd) {
+ kthread_stop(be->blkif->xenblkd);
+ be->blkif->xenblkd = NULL;
+ }
+ xenbus_switch_state(dev, XenbusStateClosing);
+ break;
+
+ case XenbusStateClosed:
+ xenbus_switch_state(dev, XenbusStateClosed);
+ if (xenbus_dev_is_online(dev))
+ break;
+ /* fall through if not online */
+ case XenbusStateUnknown:
+ device_unregister(&dev->dev);
+ break;
+
+ default:
+ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+ frontend_state);
+ break;
+ }
+}
+
+
+/**
+ * Switch to Connected state.
+ */
+static void connect(struct backend_info *be)
+{
+ int err;
+
+ struct xenbus_device *dev = be->dev;
+
+ err = xenbus_switch_state(dev, XenbusStateConnected);
+ if (err)
+ xenbus_dev_fatal(dev, err, "switching to Connected state",
+ dev->nodename);
+
+ return;
+}
+
+
+static int connect_ring(struct backend_info *be)
+{
+ struct xenbus_device *dev = be->dev;
+ unsigned long ring_ref;
+ unsigned int evtchn;
+ char protocol[64];
+ int err;
+
+ DPRINTK("%s\n", dev->otherend);
+
+ err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
+ &ring_ref, "event-channel", "%u", &evtchn, NULL);
+ if (err) {
+ xenbus_dev_fatal(dev, err,
+ "reading %s/ring-ref and event-channel",
+ dev->otherend);
+ return err;
+ }
+
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+ err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
+ "%63s", protocol, NULL);
+ if (err) {
+ strcpy(protocol, "unspecified");
+ be->blkif->blk_protocol = xen_guest_blkif_protocol(be->blkif->domid);
+ }
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+#if 1 /* maintain compatibility with early sles10-sp1 and paravirt netware betas */
+ else if (0 == strcmp(protocol, "1"))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+ else if (0 == strcmp(protocol, "2"))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+#endif
+ else {
+ xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
+ return -1;
+ }
+ printk(KERN_INFO
+ "blktap: ring-ref %ld, event-channel %d, protocol %d (%s)\n",
+ ring_ref, evtchn, be->blkif->blk_protocol, protocol);
+
+ /* Map the shared frame, irq etc. */
+ err = tap_blkif_map(be->blkif, ring_ref, evtchn);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
+ ring_ref, evtchn);
+ return err;
+ }
+
+ return 0;
+}
+
+
+/* ** Driver Registration ** */
+
+
+static const struct xenbus_device_id blktap_ids[] = {
+ { "tap" },
+ { "" }
+};
+
+
+static struct xenbus_driver blktap = {
+ .name = "tap",
+ .ids = blktap_ids,
+ .probe = blktap_probe,
+ .remove = blktap_remove,
+ .otherend_changed = tap_frontend_changed
+};
+
+
+void tap_blkif_xenbus_init(void)
+{
+ if (xenbus_register_backend(&blktap))
+ BUG();
+}
--- /dev/null
+obj-$(CONFIG_XEN_DEVMEM) := mem.o
--- /dev/null
+/*
+ * Originally from linux/drivers/char/mem.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Added devfs support.
+ * Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
+ * Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mman.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/raw.h>
+#include <linux/tty.h>
+#include <linux/capability.h>
+#include <linux/ptrace.h>
+#include <linux/device.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/hypervisor.h>
+
+static inline int uncached_access(struct file *file)
+{
+ if (file->f_flags & O_SYNC)
+ return 1;
+ /* Xen sets correct MTRR type on non-RAM for us. */
+ return 0;
+}
+
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+#ifdef CONFIG_NONPROMISC_DEVMEM
+ u64 from = ((u64)pfn) << PAGE_SHIFT;
+ u64 to = from + size;
+ u64 cursor = from;
+
+ while (cursor < to) {
+ if (!devmem_is_allowed(pfn)) {
+ printk(KERN_INFO
+ "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
+ current->comm, from, to);
+ return 0;
+ }
+ cursor += PAGE_SIZE;
+ pfn++;
+ }
+#endif
+ return 1;
+}
+
+/*
+ * This funcion reads the *physical* memory. The f_pos points directly to the
+ * memory location.
+ */
+static ssize_t read_mem(struct file * file, char __user * buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned long p = *ppos, ignored;
+ ssize_t read = 0, sz;
+ void __iomem *v;
+
+ while (count > 0) {
+ /*
+ * Handle first page in case it's not aligned
+ */
+ if (-p & (PAGE_SIZE - 1))
+ sz = -p & (PAGE_SIZE - 1);
+ else
+ sz = PAGE_SIZE;
+
+ sz = min_t(unsigned long, sz, count);
+
+ if (!range_is_allowed(p >> PAGE_SHIFT, count))
+ return -EPERM;
+
+ v = ioremap(p, sz);
+ if (IS_ERR(v) || v == NULL) {
+ /*
+ * Some programs (e.g., dmidecode) groove off into
+ * weird RAM areas where no tables can possibly exist
+ * (because Xen will have stomped on them!). These
+ * programs get rather upset if we let them know that
+ * Xen failed their access, so we fake out a read of
+ * all zeroes.
+ */
+ if (clear_user(buf, count))
+ return -EFAULT;
+ read += count;
+ break;
+ }
+
+ ignored = copy_to_user(buf, v, sz);
+ iounmap(v);
+ if (ignored)
+ return -EFAULT;
+ buf += sz;
+ p += sz;
+ count -= sz;
+ read += sz;
+ }
+
+ *ppos += read;
+ return read;
+}
+
+static ssize_t write_mem(struct file * file, const char __user * buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned long p = *ppos, ignored;
+ ssize_t written = 0, sz;
+ void __iomem *v;
+
+ while (count > 0) {
+ /*
+ * Handle first page in case it's not aligned
+ */
+ if (-p & (PAGE_SIZE - 1))
+ sz = -p & (PAGE_SIZE - 1);
+ else
+ sz = PAGE_SIZE;
+
+ sz = min_t(unsigned long, sz, count);
+
+ if (!range_is_allowed(p >> PAGE_SHIFT, sz))
+ return -EPERM;
+
+ v = ioremap(p, sz);
+ if (v == NULL)
+ break;
+ if (IS_ERR(v)) {
+ if (written == 0)
+ return PTR_ERR(v);
+ break;
+ }
+
+ ignored = copy_from_user(v, buf, sz);
+ iounmap(v);
+ if (ignored) {
+ written += sz - ignored;
+ if (written)
+ break;
+ return -EFAULT;
+ }
+ buf += sz;
+ p += sz;
+ count -= sz;
+ written += sz;
+ }
+
+ *ppos += written;
+ return written;
+}
+
+#ifndef ARCH_HAS_DEV_MEM_MMAP_MEM
+static void mmap_mem_open(struct vm_area_struct *vma)
+{
+ map_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
+ vma->vm_page_prot);
+}
+
+static void mmap_mem_close(struct vm_area_struct *vma)
+{
+ unmap_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
+ vma->vm_page_prot);
+}
+
+static struct vm_operations_struct mmap_mem_ops = {
+ .open = mmap_mem_open,
+ .close = mmap_mem_close
+};
+
+static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
+{
+ size_t size = vma->vm_end - vma->vm_start;
+
+ if (uncached_access(file))
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ if (!range_is_allowed(vma->vm_pgoff, size))
+ return -EPERM;
+
+ if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size,
+ &vma->vm_page_prot))
+ return -EINVAL;
+
+ vma->vm_ops = &mmap_mem_ops;
+
+ /* We want to return the real error code, not EAGAIN. */
+ return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+ size, vma->vm_page_prot, DOMID_IO);
+}
+#endif
+
+/*
+ * The memory devices use the full 32/64 bits of the offset, and so we cannot
+ * check against negative addresses: they are ok. The return value is weird,
+ * though, in that case (0).
+ *
+ * also note that seeking relative to the "end of file" isn't supported:
+ * it has no meaning, so it returns -EINVAL.
+ */
+static loff_t memory_lseek(struct file * file, loff_t offset, int orig)
+{
+ loff_t ret;
+
+ mutex_lock(&file->f_path.dentry->d_inode->i_mutex);
+ switch (orig) {
+ case 0:
+ file->f_pos = offset;
+ ret = file->f_pos;
+ force_successful_syscall_return();
+ break;
+ case 1:
+ file->f_pos += offset;
+ ret = file->f_pos;
+ force_successful_syscall_return();
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
+ return ret;
+}
+
+static int open_mem(struct inode * inode, struct file * filp)
+{
+ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+}
+
+const struct file_operations mem_fops = {
+ .llseek = memory_lseek,
+ .read = read_mem,
+ .write = write_mem,
+ .mmap = xen_mmap_mem,
+ .open = open_mem,
+};
--- /dev/null
+
+obj-$(CONFIG_XEN_CONSOLE) := console.o xencons_ring.o
+obj-$(CONFIG_XEN_PRIVILEGED_GUEST) += dom0.o
--- /dev/null
+/******************************************************************************
+ * console.c
+ *
+ * Virtual console driver.
+ *
+ * Copyright (c) 2002-2004, K A Fraser.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/serial.h>
+#include <linux/major.h>
+#include <linux/ptrace.h>
+#include <linux/ioport.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/bootmem.h>
+#include <linux/sysrq.h>
+#include <linux/screen_info.h>
+#include <linux/vt.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/uaccess.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+#include <asm/hypervisor.h>
+#include <xen/evtchn.h>
+#include <xen/xenbus.h>
+#include <xen/xencons.h>
+
+/*
+ * Modes:
+ * 'xencons=off' [XC_OFF]: Console is disabled.
+ * 'xencons=tty' [XC_TTY]: Console attached to '/dev/tty[0-9]+'.
+ * 'xencons=ttyS' [XC_SERIAL]: Console attached to '/dev/ttyS[0-9]+'.
+ * 'xencons=xvc' [XC_XVC]: Console attached to '/dev/xvc0'.
+ * default: XC_XVC
+ *
+ * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses
+ * warnings from standard distro startup scripts.
+ */
+static enum {
+ XC_OFF, XC_TTY, XC_SERIAL, XC_XVC
+} xc_mode = XC_XVC;
+static int xc_num = -1;
+
+/* /dev/xvc0 device number allocated by lanana.org. */
+#define XEN_XVC_MAJOR 204
+#define XEN_XVC_MINOR 191
+
+static int __init xencons_setup(char *str)
+{
+ char *q;
+ int n;
+ extern int console_use_vt;
+
+ console_use_vt = 1;
+ if (!strncmp(str, "ttyS", 4)) {
+ xc_mode = XC_SERIAL;
+ str += 4;
+ } else if (!strncmp(str, "tty", 3)) {
+ xc_mode = XC_TTY;
+ str += 3;
+ console_use_vt = 0;
+ } else if (!strncmp(str, "xvc", 3)) {
+ xc_mode = XC_XVC;
+ str += 3;
+ } else if (!strncmp(str, "off", 3)) {
+ xc_mode = XC_OFF;
+ str += 3;
+ }
+
+ n = simple_strtol(str, &q, 10);
+ if (q != str)
+ xc_num = n;
+
+ return 1;
+}
+__setup("xencons=", xencons_setup);
+
+/* The kernel and user-land drivers share a common transmit buffer. */
+static unsigned int wbuf_size = 4096;
+#define WBUF_MASK(_i) ((_i)&(wbuf_size-1))
+static char *wbuf;
+static unsigned int wc, wp; /* write_cons, write_prod */
+
+static int __init xencons_bufsz_setup(char *str)
+{
+ unsigned int goal;
+ goal = simple_strtoul(str, NULL, 0);
+ if (goal) {
+ goal = roundup_pow_of_two(goal);
+ if (wbuf_size < goal)
+ wbuf_size = goal;
+ }
+ return 1;
+}
+__setup("xencons_bufsz=", xencons_bufsz_setup);
+
+/* This lock protects accesses to the common transmit buffer. */
+static DEFINE_SPINLOCK(xencons_lock);
+
+/* Common transmit-kick routine. */
+static void __xencons_tx_flush(void);
+
+static struct tty_driver *xencons_driver;
+
+/******************** Kernel console driver ********************************/
+
+static void kcons_write(struct console *c, const char *s, unsigned int count)
+{
+ int i = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&xencons_lock, flags);
+
+ while (i < count) {
+ for (; i < count; i++) {
+ if ((wp - wc) >= (wbuf_size - 1))
+ break;
+ if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n')
+ wbuf[WBUF_MASK(wp++)] = '\r';
+ }
+
+ __xencons_tx_flush();
+ }
+
+ spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+static void kcons_write_dom0(struct console *c, const char *s, unsigned int count)
+{
+
+ while (count > 0) {
+ int rc;
+ rc = HYPERVISOR_console_io( CONSOLEIO_write, count, (char *)s);
+ if (rc <= 0)
+ break;
+ count -= rc;
+ s += rc;
+ }
+}
+
+static struct tty_driver *kcons_device(struct console *c, int *index)
+{
+ *index = 0;
+ return xencons_driver;
+}
+
+static struct console kcons_info = {
+ .device = kcons_device,
+ .flags = CON_PRINTBUFFER | CON_ENABLED,
+ .index = -1,
+};
+
+static int __init xen_console_init(void)
+{
+ if (!is_running_on_xen())
+ goto out;
+
+ if (is_initial_xendomain()) {
+ kcons_info.write = kcons_write_dom0;
+ } else {
+ if (!xen_start_info->console.domU.evtchn)
+ goto out;
+ kcons_info.write = kcons_write;
+ }
+
+ switch (xc_mode) {
+ case XC_XVC:
+ strcpy(kcons_info.name, "xvc");
+ if (xc_num == -1)
+ xc_num = 0;
+ break;
+
+ case XC_SERIAL:
+ strcpy(kcons_info.name, "ttyS");
+ if (xc_num == -1)
+ xc_num = 0;
+ break;
+
+ case XC_TTY:
+ strcpy(kcons_info.name, "tty");
+ if (xc_num == -1)
+ xc_num = 1;
+ break;
+
+ default:
+ goto out;
+ }
+
+ wbuf = alloc_bootmem(wbuf_size);
+
+ register_console(&kcons_info);
+
+ out:
+ return 0;
+}
+console_initcall(xen_console_init);
+
+/*** Useful function for console debugging -- goes straight to Xen. ***/
+asmlinkage int xprintk(const char *fmt, ...)
+{
+ va_list args;
+ int printk_len;
+ static char printk_buf[1024];
+
+ /* Emit the output into the temporary buffer */
+ va_start(args, fmt);
+ printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args);
+ va_end(args);
+
+ /* Send the processed output directly to Xen. */
+ kcons_write_dom0(NULL, printk_buf, printk_len);
+
+ return 0;
+}
+
+/*** Forcibly flush console data before dying. ***/
+void xencons_force_flush(void)
+{
+ int sz;
+
+ /* Emergency console is synchronous, so there's nothing to flush. */
+ if (!is_running_on_xen() ||
+ is_initial_xendomain() ||
+ !xen_start_info->console.domU.evtchn)
+ return;
+
+ /* Spin until console data is flushed through to the daemon. */
+ while (wc != wp) {
+ int sent = 0;
+ if ((sz = wp - wc) == 0)
+ continue;
+ sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
+ if (sent > 0)
+ wc += sent;
+ }
+}
+
+
+/******************** User-space console driver (/dev/console) ************/
+
+#define DRV(_d) (_d)
+#define DUMMY_TTY(_tty) ((xc_mode == XC_TTY) && \
+ ((_tty)->index != (xc_num - 1)))
+
+static struct ktermios *xencons_termios[MAX_NR_CONSOLES];
+static struct ktermios *xencons_termios_locked[MAX_NR_CONSOLES];
+static struct tty_struct *xencons_tty;
+static int xencons_priv_irq;
+static char x_char;
+
+void xencons_rx(char *buf, unsigned len)
+{
+ int i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&xencons_lock, flags);
+ if (xencons_tty == NULL)
+ goto out;
+
+ for (i = 0; i < len; i++) {
+#ifdef CONFIG_MAGIC_SYSRQ
+ if (sysrq_on()) {
+ static unsigned long sysrq_requested;
+
+ if (buf[i] == '\x0f') { /* ^O */
+ if (!sysrq_requested) {
+ sysrq_requested = jiffies;
+ continue; /* don't print sysrq key */
+ }
+ sysrq_requested = 0;
+ } else if (sysrq_requested) {
+ unsigned long sysrq_timeout =
+ sysrq_requested + HZ*2;
+ sysrq_requested = 0;
+ if (time_before(jiffies, sysrq_timeout)) {
+ spin_unlock_irqrestore(
+ &xencons_lock, flags);
+ handle_sysrq(buf[i], xencons_tty);
+ spin_lock_irqsave(
+ &xencons_lock, flags);
+ continue;
+ }
+ }
+ }
+#endif
+ tty_insert_flip_char(xencons_tty, buf[i], 0);
+ }
+ tty_flip_buffer_push(xencons_tty);
+
+ out:
+ spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+static void __xencons_tx_flush(void)
+{
+ int sent, sz, work_done = 0;
+
+ if (x_char) {
+ if (is_initial_xendomain())
+ kcons_write_dom0(NULL, &x_char, 1);
+ else
+ while (x_char)
+ if (xencons_ring_send(&x_char, 1) == 1)
+ break;
+ x_char = 0;
+ work_done = 1;
+ }
+
+ while (wc != wp) {
+ sz = wp - wc;
+ if (sz > (wbuf_size - WBUF_MASK(wc)))
+ sz = wbuf_size - WBUF_MASK(wc);
+ if (is_initial_xendomain()) {
+ kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz);
+ wc += sz;
+ } else {
+ sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
+ if (sent == 0)
+ break;
+ wc += sent;
+ }
+ work_done = 1;
+ }
+
+ if (work_done && (xencons_tty != NULL)) {
+ wake_up_interruptible(&xencons_tty->write_wait);
+ if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
+ (xencons_tty->ldisc.write_wakeup != NULL))
+ (xencons_tty->ldisc.write_wakeup)(xencons_tty);
+ }
+}
+
+void xencons_tx(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&xencons_lock, flags);
+ __xencons_tx_flush();
+ spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+/* Privileged receive callback and transmit kicker. */
+static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id)
+{
+ static char rbuf[16];
+ int l;
+
+ while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0)
+ xencons_rx(rbuf, l);
+
+ xencons_tx();
+
+ return IRQ_HANDLED;
+}
+
+static int xencons_write_room(struct tty_struct *tty)
+{
+ return wbuf_size - (wp - wc);
+}
+
+static int xencons_chars_in_buffer(struct tty_struct *tty)
+{
+ return wp - wc;
+}
+
+static void xencons_send_xchar(struct tty_struct *tty, char ch)
+{
+ unsigned long flags;
+
+ if (DUMMY_TTY(tty))
+ return;
+
+ spin_lock_irqsave(&xencons_lock, flags);
+ x_char = ch;
+ __xencons_tx_flush();
+ spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+static void xencons_throttle(struct tty_struct *tty)
+{
+ if (DUMMY_TTY(tty))
+ return;
+
+ if (I_IXOFF(tty))
+ xencons_send_xchar(tty, STOP_CHAR(tty));
+}
+
+static void xencons_unthrottle(struct tty_struct *tty)
+{
+ if (DUMMY_TTY(tty))
+ return;
+
+ if (I_IXOFF(tty)) {
+ if (x_char != 0)
+ x_char = 0;
+ else
+ xencons_send_xchar(tty, START_CHAR(tty));
+ }
+}
+
+static void xencons_flush_buffer(struct tty_struct *tty)
+{
+ unsigned long flags;
+
+ if (DUMMY_TTY(tty))
+ return;
+
+ spin_lock_irqsave(&xencons_lock, flags);
+ wc = wp = 0;
+ spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+static inline int __xencons_put_char(int ch)
+{
+ char _ch = (char)ch;
+ if ((wp - wc) == wbuf_size)
+ return 0;
+ wbuf[WBUF_MASK(wp++)] = _ch;
+ return 1;
+}
+
+static int xencons_write(
+ struct tty_struct *tty,
+ const unsigned char *buf,
+ int count)
+{
+ int i;
+ unsigned long flags;
+
+ if (DUMMY_TTY(tty))
+ return count;
+
+ spin_lock_irqsave(&xencons_lock, flags);
+
+ for (i = 0; i < count; i++)
+ if (!__xencons_put_char(buf[i]))
+ break;
+
+ if (i != 0)
+ __xencons_tx_flush();
+
+ spin_unlock_irqrestore(&xencons_lock, flags);
+
+ return i;
+}
+
+static int xencons_put_char(struct tty_struct *tty, u_char ch)
+{
+ unsigned long flags;
+ int ret;
+
+ if (DUMMY_TTY(tty))
+ return 0;
+
+ spin_lock_irqsave(&xencons_lock, flags);
+ ret = __xencons_put_char(ch);
+ spin_unlock_irqrestore(&xencons_lock, flags);
+ return ret;
+}
+
+static void xencons_flush_chars(struct tty_struct *tty)
+{
+ unsigned long flags;
+
+ if (DUMMY_TTY(tty))
+ return;
+
+ spin_lock_irqsave(&xencons_lock, flags);
+ __xencons_tx_flush();
+ spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+static void xencons_wait_until_sent(struct tty_struct *tty, int timeout)
+{
+ unsigned long orig_jiffies = jiffies;
+
+ if (DUMMY_TTY(tty))
+ return;
+
+ while (tty_chars_in_buffer(tty)) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(1);
+ if (signal_pending(current))
+ break;
+ if (timeout && time_after(jiffies, orig_jiffies + timeout))
+ break;
+ }
+
+ set_current_state(TASK_RUNNING);
+}
+
+static int xencons_open(struct tty_struct *tty, struct file *filp)
+{
+ unsigned long flags;
+
+ if (DUMMY_TTY(tty))
+ return 0;
+
+ spin_lock_irqsave(&xencons_lock, flags);
+ tty->driver_data = NULL;
+ if (xencons_tty == NULL)
+ xencons_tty = tty;
+ __xencons_tx_flush();
+ spin_unlock_irqrestore(&xencons_lock, flags);
+
+ return 0;
+}
+
+static void xencons_close(struct tty_struct *tty, struct file *filp)
+{
+ unsigned long flags;
+
+ if (DUMMY_TTY(tty))
+ return;
+
+ mutex_lock(&tty_mutex);
+
+ if (tty->count != 1) {
+ mutex_unlock(&tty_mutex);
+ return;
+ }
+
+ /* Prevent other threads from re-opening this tty. */
+ set_bit(TTY_CLOSING, &tty->flags);
+ mutex_unlock(&tty_mutex);
+
+ tty->closing = 1;
+ tty_wait_until_sent(tty, 0);
+ tty_driver_flush_buffer(tty);
+ if (tty->ldisc.flush_buffer != NULL)
+ tty->ldisc.flush_buffer(tty);
+ tty->closing = 0;
+ spin_lock_irqsave(&xencons_lock, flags);
+ xencons_tty = NULL;
+ spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+static const struct tty_operations xencons_ops = {
+ .open = xencons_open,
+ .close = xencons_close,
+ .write = xencons_write,
+ .write_room = xencons_write_room,
+ .put_char = xencons_put_char,
+ .flush_chars = xencons_flush_chars,
+ .chars_in_buffer = xencons_chars_in_buffer,
+ .send_xchar = xencons_send_xchar,
+ .flush_buffer = xencons_flush_buffer,
+ .throttle = xencons_throttle,
+ .unthrottle = xencons_unthrottle,
+ .wait_until_sent = xencons_wait_until_sent,
+};
+
+static int __init xencons_init(void)
+{
+ int rc;
+
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ if (xc_mode == XC_OFF)
+ return 0;
+
+ if (!is_initial_xendomain()) {
+ rc = xencons_ring_init();
+ if (rc)
+ return rc;
+ }
+
+ xencons_driver = alloc_tty_driver((xc_mode == XC_TTY) ?
+ MAX_NR_CONSOLES : 1);
+ if (xencons_driver == NULL)
+ return -ENOMEM;
+
+ DRV(xencons_driver)->name = "xencons";
+ DRV(xencons_driver)->major = TTY_MAJOR;
+ DRV(xencons_driver)->type = TTY_DRIVER_TYPE_SERIAL;
+ DRV(xencons_driver)->subtype = SERIAL_TYPE_NORMAL;
+ DRV(xencons_driver)->init_termios = tty_std_termios;
+ DRV(xencons_driver)->flags =
+ TTY_DRIVER_REAL_RAW |
+ TTY_DRIVER_RESET_TERMIOS;
+ DRV(xencons_driver)->termios = xencons_termios;
+ DRV(xencons_driver)->termios_locked = xencons_termios_locked;
+
+ switch (xc_mode) {
+ case XC_XVC:
+ DRV(xencons_driver)->name = "xvc";
+ DRV(xencons_driver)->major = XEN_XVC_MAJOR;
+ DRV(xencons_driver)->minor_start = XEN_XVC_MINOR;
+ DRV(xencons_driver)->name_base = xc_num;
+ break;
+ case XC_SERIAL:
+ DRV(xencons_driver)->name = "ttyS";
+ DRV(xencons_driver)->minor_start = 64 + xc_num;
+ DRV(xencons_driver)->name_base = xc_num;
+ break;
+ default:
+ DRV(xencons_driver)->name = "tty";
+ DRV(xencons_driver)->minor_start = 1;
+ DRV(xencons_driver)->name_base = 1;
+ break;
+ }
+
+ tty_set_operations(xencons_driver, &xencons_ops);
+
+ if ((rc = tty_register_driver(DRV(xencons_driver))) != 0) {
+ printk("WARNING: Failed to register Xen virtual "
+ "console driver as '%s%d'\n",
+ DRV(xencons_driver)->name,
+ DRV(xencons_driver)->name_base);
+ put_tty_driver(xencons_driver);
+ xencons_driver = NULL;
+ return rc;
+ }
+
+ if (is_initial_xendomain()) {
+ xencons_priv_irq = bind_virq_to_irqhandler(
+ VIRQ_CONSOLE,
+ 0,
+ xencons_priv_interrupt,
+ 0,
+ "console",
+ NULL);
+ BUG_ON(xencons_priv_irq < 0);
+ }
+
+ printk("Xen virtual console successfully installed as %s%d\n",
+ DRV(xencons_driver)->name, xc_num);
+
+ return 0;
+}
+
+module_init(xencons_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null
+/******************************************************************************
+ * dom0.c
+ *
+ * Dom0 console parameter initialization.
+ *
+ * Copyright (c) 2002-2004, K A Fraser.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/screen_info.h>
+#include <linux/init.h>
+#include <xen/interface/xen.h>
+#include <xen/xencons.h>
+
+void __init dom0_init_screen_info(const struct dom0_vga_console_info *info, size_t size)
+{
+ /* This is drawn from a dump from vgacon:startup in
+ * standard Linux. */
+ screen_info.orig_video_mode = 3;
+ screen_info.orig_video_isVGA = 1;
+ screen_info.orig_video_lines = 25;
+ screen_info.orig_video_cols = 80;
+ screen_info.orig_video_ega_bx = 3;
+ screen_info.orig_video_points = 16;
+ screen_info.orig_y = screen_info.orig_video_lines - 1;
+
+ switch (info->video_type) {
+ case XEN_VGATYPE_TEXT_MODE_3:
+ if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
+ + sizeof(info->u.text_mode_3))
+ break;
+ screen_info.orig_video_lines = info->u.text_mode_3.rows;
+ screen_info.orig_video_cols = info->u.text_mode_3.columns;
+ screen_info.orig_x = info->u.text_mode_3.cursor_x;
+ screen_info.orig_y = info->u.text_mode_3.cursor_y;
+ screen_info.orig_video_points =
+ info->u.text_mode_3.font_height;
+ break;
+
+ case XEN_VGATYPE_VESA_LFB:
+ if (size < offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps))
+ break;
+ screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
+ screen_info.lfb_width = info->u.vesa_lfb.width;
+ screen_info.lfb_height = info->u.vesa_lfb.height;
+ screen_info.lfb_depth = info->u.vesa_lfb.bits_per_pixel;
+ screen_info.lfb_base = info->u.vesa_lfb.lfb_base;
+ screen_info.lfb_size = info->u.vesa_lfb.lfb_size;
+ screen_info.lfb_linelength = info->u.vesa_lfb.bytes_per_line;
+ screen_info.red_size = info->u.vesa_lfb.red_size;
+ screen_info.red_pos = info->u.vesa_lfb.red_pos;
+ screen_info.green_size = info->u.vesa_lfb.green_size;
+ screen_info.green_pos = info->u.vesa_lfb.green_pos;
+ screen_info.blue_size = info->u.vesa_lfb.blue_size;
+ screen_info.blue_pos = info->u.vesa_lfb.blue_pos;
+ screen_info.rsvd_size = info->u.vesa_lfb.rsvd_size;
+ screen_info.rsvd_pos = info->u.vesa_lfb.rsvd_pos;
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps)
+ + sizeof(info->u.vesa_lfb.gbl_caps))
+ screen_info.capabilities = info->u.vesa_lfb.gbl_caps;
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.mode_attrs)
+ + sizeof(info->u.vesa_lfb.mode_attrs))
+ screen_info.vesa_attributes = info->u.vesa_lfb.mode_attrs;
+ break;
+ }
+}
--- /dev/null
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/serial.h>
+#include <linux/major.h>
+#include <linux/ptrace.h>
+#include <linux/ioport.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include <asm/hypervisor.h>
+#include <xen/evtchn.h>
+#include <xen/xencons.h>
+#include <linux/wait.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <xen/interface/io/console.h>
+
+static int xencons_irq;
+
+static inline struct xencons_interface *xencons_interface(void)
+{
+ return mfn_to_virt(xen_start_info->console.domU.mfn);
+}
+
+static inline void notify_daemon(void)
+{
+ /* Use evtchn: this is called early, before irq is set up. */
+ notify_remote_via_evtchn(xen_start_info->console.domU.evtchn);
+}
+
+int xencons_ring_send(const char *data, unsigned len)
+{
+ int sent = 0;
+ struct xencons_interface *intf = xencons_interface();
+ XENCONS_RING_IDX cons, prod;
+
+ cons = intf->out_cons;
+ prod = intf->out_prod;
+ mb();
+ BUG_ON((prod - cons) > sizeof(intf->out));
+
+ while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
+ intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
+
+ wmb();
+ intf->out_prod = prod;
+
+ notify_daemon();
+
+ return sent;
+}
+
+static irqreturn_t handle_input(int irq, void *unused)
+{
+ struct xencons_interface *intf = xencons_interface();
+ XENCONS_RING_IDX cons, prod;
+
+ cons = intf->in_cons;
+ prod = intf->in_prod;
+ mb();
+ BUG_ON((prod - cons) > sizeof(intf->in));
+
+ while (cons != prod) {
+ xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1);
+ cons++;
+ }
+
+ mb();
+ intf->in_cons = cons;
+
+ notify_daemon();
+
+ xencons_tx();
+
+ return IRQ_HANDLED;
+}
+
+int xencons_ring_init(void)
+{
+ int irq;
+
+ if (xencons_irq)
+ unbind_from_irqhandler(xencons_irq, NULL);
+ xencons_irq = 0;
+
+ if (!is_running_on_xen() ||
+ is_initial_xendomain() ||
+ !xen_start_info->console.domU.evtchn)
+ return -ENODEV;
+
+ irq = bind_caller_port_to_irqhandler(
+ xen_start_info->console.domU.evtchn,
+ handle_input, 0, "xencons", NULL);
+ if (irq < 0) {
+ printk(KERN_ERR "XEN console request irq failed %i\n", irq);
+ return irq;
+ }
+
+ xencons_irq = irq;
+
+ /* In case we have in-flight data after save/restore... */
+ notify_daemon();
+
+ return 0;
+}
+
+void xencons_resume(void)
+{
+ (void)xencons_ring_init();
+}
--- /dev/null
+#
+# Makefile for the linux kernel.
+#
+
+obj-y := evtchn.o gnttab.o features.o reboot.o machine_reboot.o firmware.o domctl.o
+
+obj-$(CONFIG_PROC_FS) += xen_proc.o
+obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor_sysfs.o
+obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
+obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
+obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o
+obj-$(CONFIG_KEXEC) += machine_kexec.o
+obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
--- /dev/null
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <xen/cpu_hotplug.h>
+#include <xen/xenbus.h>
+
+/*
+ * Set of CPUs that remote admin software will allow us to bring online.
+ * Notified to us via xenbus.
+ */
+static cpumask_t xenbus_allowed_cpumask;
+
+/* Set of CPUs that local admin will allow us to bring online. */
+static cpumask_t local_allowed_cpumask = CPU_MASK_ALL;
+
+static int local_cpu_hotplug_request(void)
+{
+ /*
+ * We assume a CPU hotplug request comes from local admin if it is made
+ * via a userspace process (i.e., one with a real mm_struct).
+ */
+ return (current->mm != NULL);
+}
+
+static void vcpu_hotplug(unsigned int cpu)
+{
+ int err;
+ char dir[32], state[32];
+
+ if ((cpu >= NR_CPUS) || !cpu_possible(cpu))
+ return;
+
+ sprintf(dir, "cpu/%u", cpu);
+ err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state);
+ if (err != 1) {
+ printk(KERN_ERR "XENBUS: Unable to read cpu state\n");
+ return;
+ }
+
+ if (strcmp(state, "online") == 0) {
+ cpu_set(cpu, xenbus_allowed_cpumask);
+ (void)cpu_up(cpu);
+ } else if (strcmp(state, "offline") == 0) {
+ cpu_clear(cpu, xenbus_allowed_cpumask);
+ (void)cpu_down(cpu);
+ } else {
+ printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n",
+ state, cpu);
+ }
+}
+
+static void handle_vcpu_hotplug_event(
+ struct xenbus_watch *watch, const char **vec, unsigned int len)
+{
+ unsigned int cpu;
+ char *cpustr;
+ const char *node = vec[XS_WATCH_PATH];
+
+ if ((cpustr = strstr(node, "cpu/")) != NULL) {
+ sscanf(cpustr, "cpu/%u", &cpu);
+ vcpu_hotplug(cpu);
+ }
+}
+
+static int smpboot_cpu_notify(struct notifier_block *notifier,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ /*
+ * We do this in a callback notifier rather than __cpu_disable()
+ * because local_cpu_hotplug_request() does not work in the latter
+ * as it's always executed from within a stopmachine kthread.
+ */
+ if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request())
+ cpu_clear(cpu, local_allowed_cpumask);
+
+ return NOTIFY_OK;
+}
+
+static int setup_cpu_watcher(struct notifier_block *notifier,
+ unsigned long event, void *data)
+{
+ unsigned int i;
+
+ static struct xenbus_watch cpu_watch = {
+ .node = "cpu",
+ .callback = handle_vcpu_hotplug_event,
+ .flags = XBWF_new_thread };
+ (void)register_xenbus_watch(&cpu_watch);
+
+ if (!is_initial_xendomain()) {
+ for_each_possible_cpu(i)
+ vcpu_hotplug(i);
+ printk(KERN_INFO "Brought up %ld CPUs\n",
+ (long)num_online_cpus());
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int __init setup_vcpu_hotplug_event(void)
+{
+ static struct notifier_block hotplug_cpu = {
+ .notifier_call = smpboot_cpu_notify };
+ static struct notifier_block xsn_cpu = {
+ .notifier_call = setup_cpu_watcher };
+
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ register_cpu_notifier(&hotplug_cpu);
+ register_xenstore_notifier(&xsn_cpu);
+
+ return 0;
+}
+
+arch_initcall(setup_vcpu_hotplug_event);
+
+int smp_suspend(void)
+{
+ unsigned int cpu;
+ int err;
+
+ for_each_online_cpu(cpu) {
+ if (cpu == 0)
+ continue;
+ err = cpu_down(cpu);
+ if (err) {
+ printk(KERN_CRIT "Failed to take all CPUs "
+ "down: %d.\n", err);
+ for_each_possible_cpu(cpu)
+ vcpu_hotplug(cpu);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+void smp_resume(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ vcpu_hotplug(cpu);
+}
+
+int cpu_up_check(unsigned int cpu)
+{
+ int rc = 0;
+
+ if (local_cpu_hotplug_request()) {
+ cpu_set(cpu, local_allowed_cpumask);
+ if (!cpu_isset(cpu, xenbus_allowed_cpumask)) {
+ printk("%s: attempt to bring up CPU %u disallowed by "
+ "remote admin.\n", __FUNCTION__, cpu);
+ rc = -EBUSY;
+ }
+ } else if (!cpu_isset(cpu, local_allowed_cpumask) ||
+ !cpu_isset(cpu, xenbus_allowed_cpumask)) {
+ rc = -EBUSY;
+ }
+
+ return rc;
+}
+
+void init_xenbus_allowed_cpumask(void)
+{
+ xenbus_allowed_cpumask = cpu_present_map;
+}
--- /dev/null
+/*
+ * !!! dirty hack alert !!!
+ *
+ * Problem: old guests kernels don't have a "protocol" node
+ * in the frontend xenstore directory, so mixing
+ * 32 and 64bit domains doesn't work.
+ *
+ * Upstream plans to solve this in the tools, by letting them
+ * create a protocol node. Which certainly makes sense.
+ * But it isn't trivial and isn't done yet. Too bad.
+ *
+ * So for the time being we use the get_address_size domctl
+ * hypercall for a pretty good guess. Not nice as the domctl
+ * hypercall isn't supposed to be used by the kernel. Because
+ * we don't want to have dependencies between dom0 kernel and
+ * xen kernel versions. Now we have one. Ouch.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/hypervisor.h>
+#include <xen/blkif.h>
+
+#include "domctl.h"
+
+/* stuff copied from xen/interface/domctl.h, which we can't
+ * include directly for the reasons outlined above .... */
+
+#define XEN_DOMCTL_get_address_size 36
+typedef struct xen_domctl_address_size {
+ uint32_t size;
+} xen_domctl_address_size_t;
+
+typedef __attribute__((aligned(8))) uint64_t uint64_aligned_t;
+
+union xen_domctl {
+ /* v4: sles10 sp1: xen 3.0.4 + 32-on-64 patches */
+ struct {
+ uint32_t cmd;
+ uint32_t interface_version;
+ domid_t domain;
+ union {
+ /* left out lots of other struct xen_domctl_foobar */
+ struct xen_domctl_address_size address_size;
+ uint64_t dummy_align;
+ uint8_t dummy_pad[128];
+ } u;
+ } v4;
+
+ /* v5: upstream: xen 3.1 */
+ struct {
+ uint32_t cmd;
+ uint32_t interface_version;
+ domid_t domain;
+ union {
+ struct xen_domctl_address_size address_size;
+ uint64_aligned_t dummy_align;
+ uint8_t dummy_pad[128];
+ } u;
+ } v5;
+};
+
+/* The actual code comes here */
+
+static inline int hypervisor_domctl(void *domctl)
+{
+ return _hypercall1(int, domctl, domctl);
+}
+
+int xen_guest_address_size(int domid)
+{
+ union xen_domctl domctl;
+ int low, ret;
+
+#define guest_address_size(ver) do { \
+ memset(&domctl, 0, sizeof(domctl)); \
+ domctl.v##ver.cmd = XEN_DOMCTL_get_address_size; \
+ domctl.v##ver.interface_version = low = ver; \
+ domctl.v##ver.domain = domid; \
+ ret = hypervisor_domctl(&domctl) ?: domctl.v##ver.u.address_size.size; \
+ if (ret == 32 || ret == 64) { \
+ printk("v" #ver " domctl worked ok: dom%d is %d-bit\n", \
+ domid, ret); \
+ return ret; \
+ } \
+} while (0)
+
+ guest_address_size(5);
+#if CONFIG_XEN_COMPAT < 0x030100
+ guest_address_size(4);
+#endif
+
+ ret = BITS_PER_LONG;
+ printk("v%d...5 domctls failed, assuming dom%d is native: %d\n",
+ low, domid, ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xen_guest_address_size);
+
+int xen_guest_blkif_protocol(int domid)
+{
+ int address_size = xen_guest_address_size(domid);
+
+ if (address_size == BITS_PER_LONG)
+ return BLKIF_PROTOCOL_NATIVE;
+ if (address_size == 32)
+ return BLKIF_PROTOCOL_X86_32;
+ if (address_size == 64)
+ return BLKIF_PROTOCOL_X86_64;
+ return BLKIF_PROTOCOL_NATIVE;
+}
+EXPORT_SYMBOL_GPL(xen_guest_blkif_protocol);
--- /dev/null
+int xen_guest_address_size(int domid);
+int xen_guest_blkif_protocol(int domid);
--- /dev/null
+/******************************************************************************
+ * evtchn.c
+ *
+ * Communication via Xen event channels.
+ *
+ * Copyright (c) 2002-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/version.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/ptrace.h>
+#include <asm/synch_bitops.h>
+#include <xen/evtchn.h>
+#include <xen/interface/event_channel.h>
+#include <xen/interface/physdev.h>
+#include <asm/hypervisor.h>
+#include <linux/mc146818rtc.h> /* RTC_IRQ */
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static DEFINE_SPINLOCK(irq_mapping_update_lock);
+
+/* IRQ <-> event-channel mappings. */
+static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+ [0 ... NR_EVENT_CHANNELS-1] = -1 };
+
+/* Packed IRQ information: binding type, sub-type index, and event channel. */
+static u32 irq_info[NR_IRQS];
+
+/* Binding types. */
+enum {
+ IRQT_UNBOUND,
+ IRQT_PIRQ,
+ IRQT_VIRQ,
+ IRQT_IPI,
+ IRQT_LOCAL_PORT,
+ IRQT_CALLER_PORT
+};
+
+/* Constructor for packed IRQ information. */
+static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn)
+{
+ return ((type << 24) | (index << 16) | evtchn);
+}
+
+/* Convenient shorthand for packed representation of an unbound IRQ. */
+#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
+
+/*
+ * Accessors for packed IRQ information.
+ */
+
+static inline unsigned int evtchn_from_irq(int irq)
+{
+ return (u16)(irq_info[irq]);
+}
+
+static inline unsigned int index_from_irq(int irq)
+{
+ return (u8)(irq_info[irq] >> 16);
+}
+
+static inline unsigned int type_from_irq(int irq)
+{
+ return (u8)(irq_info[irq] >> 24);
+}
+
+/* IRQ <-> VIRQ mapping. */
+DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
+
+/* IRQ <-> IPI mapping. */
+#ifndef NR_IPIS
+#define NR_IPIS 1
+#endif
+DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) = {[0 ... NR_IPIS-1] = -1};
+
+/* Reference counts for bindings to IRQs. */
+static int irq_bindcount[NR_IRQS];
+
+/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
+static DECLARE_BITMAP(pirq_needs_eoi, NR_PIRQS);
+
+#ifdef CONFIG_SMP
+
+static u8 cpu_evtchn[NR_EVENT_CHANNELS];
+static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
+
+static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
+ unsigned int idx)
+{
+ return (sh->evtchn_pending[idx] &
+ cpu_evtchn_mask[cpu][idx] &
+ ~sh->evtchn_mask[idx]);
+}
+
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+ int irq = evtchn_to_irq[chn];
+
+ BUG_ON(!test_bit(chn, s->evtchn_mask));
+
+ if (irq != -1)
+ irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+
+ clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]);
+ set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]);
+ cpu_evtchn[chn] = cpu;
+}
+
+static void init_evtchn_cpu_bindings(void)
+{
+ int i;
+
+ /* By default all event channels notify CPU#0. */
+ for (i = 0; i < NR_IRQS; i++)
+ irq_desc[i].affinity = cpumask_of_cpu(0);
+
+ memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
+ memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
+}
+
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+ return cpu_evtchn[evtchn];
+}
+
+#else
+
+static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
+ unsigned int idx)
+{
+ return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]);
+}
+
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+}
+
+static void init_evtchn_cpu_bindings(void)
+{
+}
+
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+ return 0;
+}
+
+#endif
+
+/* Upcall to generic IRQ layer. */
+#ifdef CONFIG_X86
+extern unsigned int do_IRQ(struct pt_regs *regs);
+void __init xen_init_IRQ(void);
+void __init init_IRQ(void)
+{
+ irq_ctx_init(0);
+ xen_init_IRQ();
+}
+#if defined (__i386__)
+static inline void exit_idle(void) {}
+#elif defined (__x86_64__)
+#include <asm/idle.h>
+#endif
+#define do_IRQ(irq, regs) do { \
+ (regs)->orig_ax = ~(irq); \
+ do_IRQ((regs)); \
+} while (0)
+#elif defined (__powerpc__)
+#define do_IRQ(irq, regs) __do_IRQ(irq, regs)
+static inline void exit_idle(void) {}
+#endif
+
+/* Xen will never allocate port zero for any purpose. */
+#define VALID_EVTCHN(chn) ((chn) != 0)
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void force_evtchn_callback(void)
+{
+ VOID(HYPERVISOR_xen_version(0, NULL));
+}
+/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */
+EXPORT_SYMBOL(force_evtchn_callback);
+
+static DEFINE_PER_CPU(unsigned int, upcall_count) = { 0 };
+static DEFINE_PER_CPU(unsigned int, last_processed_l1i) = { BITS_PER_LONG - 1 };
+static DEFINE_PER_CPU(unsigned int, last_processed_l2i) = { BITS_PER_LONG - 1 };
+
+/* NB. Interrupts are disabled on entry. */
+asmlinkage void evtchn_do_upcall(struct pt_regs *regs)
+{
+ unsigned long l1, l2;
+ unsigned long masked_l1, masked_l2;
+ unsigned int l1i, l2i, port, count;
+ int irq;
+ unsigned int cpu = smp_processor_id();
+ shared_info_t *s = HYPERVISOR_shared_info;
+ vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
+
+
+ do {
+ /* Avoid a callback storm when we reenable delivery. */
+ vcpu_info->evtchn_upcall_pending = 0;
+
+ /* Nested invocations bail immediately. */
+ if (unlikely(per_cpu(upcall_count, cpu)++))
+ return;
+
+#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
+ /* Clear master flag /before/ clearing selector flag. */
+ rmb();
+#endif
+ l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
+
+ l1i = per_cpu(last_processed_l1i, cpu);
+ l2i = per_cpu(last_processed_l2i, cpu);
+
+ while (l1 != 0) {
+
+ l1i = (l1i + 1) % BITS_PER_LONG;
+ masked_l1 = l1 & ((~0UL) << l1i);
+
+ if (masked_l1 == 0) { /* if we masked out all events, wrap around to the beginning */
+ l1i = BITS_PER_LONG - 1;
+ l2i = BITS_PER_LONG - 1;
+ continue;
+ }
+ l1i = __ffs(masked_l1);
+
+ do {
+ l2 = active_evtchns(cpu, s, l1i);
+
+ l2i = (l2i + 1) % BITS_PER_LONG;
+ masked_l2 = l2 & ((~0UL) << l2i);
+
+ if (masked_l2 == 0) { /* if we masked out all events, move on */
+ l2i = BITS_PER_LONG - 1;
+ break;
+ }
+
+ l2i = __ffs(masked_l2);
+
+ /* process port */
+ port = (l1i * BITS_PER_LONG) + l2i;
+ if ((irq = evtchn_to_irq[port]) != -1)
+ do_IRQ(irq, regs);
+ else {
+ exit_idle();
+ evtchn_device_upcall(port);
+ }
+
+ /* if this is the final port processed, we'll pick up here+1 next time */
+ per_cpu(last_processed_l1i, cpu) = l1i;
+ per_cpu(last_processed_l2i, cpu) = l2i;
+
+ } while (l2i != BITS_PER_LONG - 1);
+
+ l2 = active_evtchns(cpu, s, l1i);
+ if (l2 == 0) /* we handled all ports, so we can clear the selector bit */
+ l1 &= ~(1UL << l1i);
+
+ }
+
+ /* If there were nested callbacks then we have more to do. */
+ count = per_cpu(upcall_count, cpu);
+ per_cpu(upcall_count, cpu) = 0;
+ } while (unlikely(count != 1));
+}
+
+static int find_unbound_irq(void)
+{
+ static int warned;
+ int dynirq, irq;
+
+ for (dynirq = 0; dynirq < NR_DYNIRQS; dynirq++) {
+ irq = dynirq_to_irq(dynirq);
+ if (irq_bindcount[irq] == 0)
+ return irq;
+ }
+
+ if (!warned) {
+ warned = 1;
+ printk(KERN_WARNING "No available IRQ to bind to: "
+ "increase NR_DYNIRQS.\n");
+ }
+
+ return -ENOSPC;
+}
+
+static int bind_caller_port_to_irq(unsigned int caller_port)
+{
+ int irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ if ((irq = evtchn_to_irq[caller_port]) == -1) {
+ if ((irq = find_unbound_irq()) < 0)
+ goto out;
+
+ evtchn_to_irq[caller_port] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_CALLER_PORT, 0, caller_port);
+ }
+
+ irq_bindcount[irq]++;
+
+ out:
+ spin_unlock(&irq_mapping_update_lock);
+ return irq;
+}
+
+static int bind_local_port_to_irq(unsigned int local_port)
+{
+ int irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ BUG_ON(evtchn_to_irq[local_port] != -1);
+
+ if ((irq = find_unbound_irq()) < 0) {
+ struct evtchn_close close = { .port = local_port };
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
+ BUG();
+ goto out;
+ }
+
+ evtchn_to_irq[local_port] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port);
+ irq_bindcount[irq]++;
+
+ out:
+ spin_unlock(&irq_mapping_update_lock);
+ return irq;
+}
+
+static int bind_listening_port_to_irq(unsigned int remote_domain)
+{
+ struct evtchn_alloc_unbound alloc_unbound;
+ int err;
+
+ alloc_unbound.dom = DOMID_SELF;
+ alloc_unbound.remote_dom = remote_domain;
+
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+ &alloc_unbound);
+
+ return err ? : bind_local_port_to_irq(alloc_unbound.port);
+}
+
+static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
+ unsigned int remote_port)
+{
+ struct evtchn_bind_interdomain bind_interdomain;
+ int err;
+
+ bind_interdomain.remote_dom = remote_domain;
+ bind_interdomain.remote_port = remote_port;
+
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+ &bind_interdomain);
+
+ return err ? : bind_local_port_to_irq(bind_interdomain.local_port);
+}
+
+static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+{
+ struct evtchn_bind_virq bind_virq;
+ int evtchn, irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) {
+ if ((irq = find_unbound_irq()) < 0)
+ goto out;
+
+ bind_virq.virq = virq;
+ bind_virq.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+ &bind_virq) != 0)
+ BUG();
+ evtchn = bind_virq.port;
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
+
+ per_cpu(virq_to_irq, cpu)[virq] = irq;
+
+ bind_evtchn_to_cpu(evtchn, cpu);
+ }
+
+ irq_bindcount[irq]++;
+
+ out:
+ spin_unlock(&irq_mapping_update_lock);
+ return irq;
+}
+
+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+{
+ struct evtchn_bind_ipi bind_ipi;
+ int evtchn, irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) {
+ if ((irq = find_unbound_irq()) < 0)
+ goto out;
+
+ bind_ipi.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+ &bind_ipi) != 0)
+ BUG();
+ evtchn = bind_ipi.port;
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
+
+ per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+
+ bind_evtchn_to_cpu(evtchn, cpu);
+ }
+
+ irq_bindcount[irq]++;
+
+ out:
+ spin_unlock(&irq_mapping_update_lock);
+ return irq;
+}
+
+static void unbind_from_irq(unsigned int irq)
+{
+ struct evtchn_close close;
+ unsigned int cpu;
+ int evtchn = evtchn_from_irq(irq);
+
+ spin_lock(&irq_mapping_update_lock);
+
+ if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) {
+ close.port = evtchn;
+ if ((type_from_irq(irq) != IRQT_CALLER_PORT) &&
+ HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
+ BUG();
+
+ switch (type_from_irq(irq)) {
+ case IRQT_VIRQ:
+ per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
+ [index_from_irq(irq)] = -1;
+ break;
+ case IRQT_IPI:
+ per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
+ [index_from_irq(irq)] = -1;
+ break;
+ default:
+ break;
+ }
+
+ /* Closed ports are implicitly re-bound to VCPU0. */
+ bind_evtchn_to_cpu(evtchn, 0);
+
+ evtchn_to_irq[evtchn] = -1;
+ irq_info[irq] = IRQ_UNBOUND;
+
+ /* Zap stats across IRQ changes of use. */
+ for_each_possible_cpu(cpu)
+ kstat_cpu(cpu).irqs[irq] = 0;
+ }
+
+ spin_unlock(&irq_mapping_update_lock);
+}
+
+int bind_caller_port_to_irqhandler(
+ unsigned int caller_port,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_caller_port_to_irq(caller_port);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_caller_port_to_irqhandler);
+
+int bind_listening_port_to_irqhandler(
+ unsigned int remote_domain,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_listening_port_to_irq(remote_domain);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_listening_port_to_irqhandler);
+
+int bind_interdomain_evtchn_to_irqhandler(
+ unsigned int remote_domain,
+ unsigned int remote_port,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
+
+int bind_virq_to_irqhandler(
+ unsigned int virq,
+ unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_virq_to_irq(virq, cpu);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
+
+int bind_ipi_to_irqhandler(
+ unsigned int ipi,
+ unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_ipi_to_irq(ipi, cpu);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_ipi_to_irqhandler);
+
+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
+{
+ free_irq(irq, dev_id);
+ unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+
+#ifdef CONFIG_SMP
+void rebind_evtchn_to_cpu(int port, unsigned int cpu)
+{
+ struct evtchn_bind_vcpu ebv = { .port = port, .vcpu = cpu };
+ int masked;
+
+ masked = test_and_set_evtchn_mask(port);
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &ebv) == 0)
+ bind_evtchn_to_cpu(port, cpu);
+ if (!masked)
+ unmask_evtchn(port);
+}
+
+static void rebind_irq_to_cpu(unsigned int irq, unsigned int tcpu)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ rebind_evtchn_to_cpu(evtchn, tcpu);
+}
+
+static void set_affinity_irq(unsigned int irq, cpumask_t dest)
+{
+ unsigned tcpu = first_cpu(dest);
+ rebind_irq_to_cpu(irq, tcpu);
+}
+#endif
+
+int resend_irq_on_evtchn(unsigned int irq)
+{
+ int masked, evtchn = evtchn_from_irq(irq);
+ shared_info_t *s = HYPERVISOR_shared_info;
+
+ if (!VALID_EVTCHN(evtchn))
+ return 1;
+
+ masked = test_and_set_evtchn_mask(evtchn);
+ synch_set_bit(evtchn, s->evtchn_pending);
+ if (!masked)
+ unmask_evtchn(evtchn);
+
+ return 1;
+}
+
+/*
+ * Interface to generic handling in irq.c
+ */
+
+static unsigned int startup_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ unmask_evtchn(evtchn);
+ return 0;
+}
+
+static void unmask_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ unmask_evtchn(evtchn);
+}
+
+static void mask_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ mask_evtchn(evtchn);
+}
+
+static void ack_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ move_native_irq(irq);
+
+ if (VALID_EVTCHN(evtchn)) {
+ mask_evtchn(evtchn);
+ clear_evtchn(evtchn);
+ }
+}
+
+static void end_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED))
+ unmask_evtchn(evtchn);
+}
+
+static struct irq_chip dynirq_chip = {
+ .name = "Dynamic-irq",
+ .startup = startup_dynirq,
+ .mask = mask_dynirq,
+ .unmask = unmask_dynirq,
+ .mask_ack = ack_dynirq,
+ .ack = ack_dynirq,
+ .end = end_dynirq,
+#ifdef CONFIG_SMP
+ .set_affinity = set_affinity_irq,
+#endif
+ .retrigger = resend_irq_on_evtchn,
+};
+
+static inline void pirq_unmask_notify(int pirq)
+{
+ struct physdev_eoi eoi = { .irq = pirq };
+ if (unlikely(test_bit(pirq, pirq_needs_eoi)))
+ VOID(HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi));
+}
+
+static inline void pirq_query_unmask(int pirq)
+{
+ struct physdev_irq_status_query irq_status;
+ irq_status.irq = pirq;
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
+ irq_status.flags = 0;
+ clear_bit(pirq, pirq_needs_eoi);
+ if (irq_status.flags & XENIRQSTAT_needs_eoi)
+ set_bit(pirq, pirq_needs_eoi);
+}
+
+/*
+ * On startup, if there is no action associated with the IRQ then we are
+ * probing. In this case we should not share with others as it will confuse us.
+ */
+#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
+
+static unsigned int startup_pirq(unsigned int irq)
+{
+ struct evtchn_bind_pirq bind_pirq;
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ goto out;
+
+ bind_pirq.pirq = irq;
+ /* NB. We are happy to share unless we are probing. */
+ bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
+ if (!probing_irq(irq))
+ printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
+ irq);
+ return 0;
+ }
+ evtchn = bind_pirq.port;
+
+ pirq_query_unmask(irq_to_pirq(irq));
+
+ evtchn_to_irq[evtchn] = irq;
+ bind_evtchn_to_cpu(evtchn, 0);
+ irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, evtchn);
+
+ out:
+ unmask_evtchn(evtchn);
+ pirq_unmask_notify(irq_to_pirq(irq));
+
+ return 0;
+}
+
+static void shutdown_pirq(unsigned int irq)
+{
+ struct evtchn_close close;
+ int evtchn = evtchn_from_irq(irq);
+
+ if (!VALID_EVTCHN(evtchn))
+ return;
+
+ mask_evtchn(evtchn);
+
+ close.port = evtchn;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+ BUG();
+
+ bind_evtchn_to_cpu(evtchn, 0);
+ evtchn_to_irq[evtchn] = -1;
+ irq_info[irq] = IRQ_UNBOUND;
+}
+
+static void unmask_pirq(unsigned int irq)
+{
+ startup_pirq(irq);
+}
+
+static void mask_pirq(unsigned int irq)
+{
+}
+
+static void ack_pirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ move_native_irq(irq);
+
+ if (VALID_EVTCHN(evtchn)) {
+ mask_evtchn(evtchn);
+ clear_evtchn(evtchn);
+ }
+}
+
+static void end_pirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if ((irq_desc[irq].status & (IRQ_DISABLED|IRQ_PENDING)) ==
+ (IRQ_DISABLED|IRQ_PENDING)) {
+ shutdown_pirq(irq);
+ } else if (VALID_EVTCHN(evtchn)) {
+ unmask_evtchn(evtchn);
+ pirq_unmask_notify(irq_to_pirq(irq));
+ }
+}
+
+static struct irq_chip pirq_chip = {
+ .name = "Phys-irq",
+ .typename = "Phys-irq",
+ .startup = startup_pirq,
+ .shutdown = shutdown_pirq,
+ .mask = mask_pirq,
+ .unmask = unmask_pirq,
+ .mask_ack = ack_pirq,
+ .ack = ack_pirq,
+ .end = end_pirq,
+#ifdef CONFIG_SMP
+ .set_affinity = set_affinity_irq,
+#endif
+ .retrigger = resend_irq_on_evtchn,
+};
+
+int irq_ignore_unhandled(unsigned int irq)
+{
+ struct physdev_irq_status_query irq_status = { .irq = irq };
+
+ if (!is_running_on_xen())
+ return 0;
+
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
+ return 0;
+ return !!(irq_status.flags & XENIRQSTAT_shared);
+}
+
+void notify_remote_via_irq(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ notify_remote_via_evtchn(evtchn);
+}
+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+
+int irq_to_evtchn_port(int irq)
+{
+ return evtchn_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(irq_to_evtchn_port);
+
+void mask_evtchn(int port)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+ synch_set_bit(port, s->evtchn_mask);
+}
+EXPORT_SYMBOL_GPL(mask_evtchn);
+
+void unmask_evtchn(int port)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+ unsigned int cpu = smp_processor_id();
+ vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
+
+ BUG_ON(!irqs_disabled());
+
+ /* Slow path (hypercall) if this is a non-local port. */
+ if (unlikely(cpu != cpu_from_evtchn(port))) {
+ struct evtchn_unmask unmask = { .port = port };
+ VOID(HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask));
+ return;
+ }
+
+ synch_clear_bit(port, s->evtchn_mask);
+
+ /* Did we miss an interrupt 'edge'? Re-fire if so. */
+ if (synch_test_bit(port, s->evtchn_pending) &&
+ !synch_test_and_set_bit(port / BITS_PER_LONG,
+ &vcpu_info->evtchn_pending_sel))
+ vcpu_info->evtchn_upcall_pending = 1;
+}
+EXPORT_SYMBOL_GPL(unmask_evtchn);
+
+void disable_all_local_evtchn(void)
+{
+ unsigned i, cpu = smp_processor_id();
+ shared_info_t *s = HYPERVISOR_shared_info;
+
+ for (i = 0; i < NR_EVENT_CHANNELS; ++i)
+ if (cpu_from_evtchn(i) == cpu)
+ synch_set_bit(i, &s->evtchn_mask[0]);
+}
+
+static void restore_cpu_virqs(unsigned int cpu)
+{
+ struct evtchn_bind_virq bind_virq;
+ int virq, irq, evtchn;
+
+ for (virq = 0; virq < NR_VIRQS; virq++) {
+ if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1)
+ continue;
+
+ BUG_ON(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0));
+
+ /* Get a new binding from Xen. */
+ bind_virq.virq = virq;
+ bind_virq.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+ &bind_virq) != 0)
+ BUG();
+ evtchn = bind_virq.port;
+
+ /* Record the new mapping. */
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
+ bind_evtchn_to_cpu(evtchn, cpu);
+
+ /* Ready for use. */
+ unmask_evtchn(evtchn);
+ }
+}
+
+static void restore_cpu_ipis(unsigned int cpu)
+{
+ struct evtchn_bind_ipi bind_ipi;
+ int ipi, irq, evtchn;
+
+ for (ipi = 0; ipi < NR_IPIS; ipi++) {
+ if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1)
+ continue;
+
+ BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0));
+
+ /* Get a new binding from Xen. */
+ bind_ipi.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+ &bind_ipi) != 0)
+ BUG();
+ evtchn = bind_ipi.port;
+
+ /* Record the new mapping. */
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
+ bind_evtchn_to_cpu(evtchn, cpu);
+
+ /* Ready for use. */
+ unmask_evtchn(evtchn);
+
+ }
+}
+
+void irq_resume(void)
+{
+ unsigned int cpu, pirq, irq, evtchn;
+
+ init_evtchn_cpu_bindings();
+
+ /* New event-channel space is not 'live' yet. */
+ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
+ mask_evtchn(evtchn);
+
+ /* Check that no PIRQs are still bound. */
+ for (pirq = 0; pirq < NR_PIRQS; pirq++)
+ BUG_ON(irq_info[pirq_to_irq(pirq)] != IRQ_UNBOUND);
+
+ /* No IRQ <-> event-channel mappings. */
+ for (irq = 0; irq < NR_IRQS; irq++)
+ irq_info[irq] &= ~0xFFFF; /* zap event-channel binding */
+ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
+ evtchn_to_irq[evtchn] = -1;
+
+ for_each_possible_cpu(cpu) {
+ restore_cpu_virqs(cpu);
+ restore_cpu_ipis(cpu);
+ }
+
+}
+
+void __init xen_init_IRQ(void)
+{
+ unsigned int i;
+
+ init_evtchn_cpu_bindings();
+
+ /* No event channels are 'live' right now. */
+ for (i = 0; i < NR_EVENT_CHANNELS; i++)
+ mask_evtchn(i);
+
+ /* No IRQ -> event-channel mappings. */
+ for (i = 0; i < NR_IRQS; i++)
+ irq_info[i] = IRQ_UNBOUND;
+
+ /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
+ for (i = 0; i < NR_DYNIRQS; i++) {
+ irq_bindcount[dynirq_to_irq(i)] = 0;
+
+ irq_desc[dynirq_to_irq(i)].status = IRQ_DISABLED;
+ irq_desc[dynirq_to_irq(i)].action = NULL;
+ irq_desc[dynirq_to_irq(i)].depth = 1;
+ set_irq_chip_and_handler_name(dynirq_to_irq(i), &dynirq_chip,
+ handle_level_irq, "level");
+ }
+
+ /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */
+ for (i = 0; i < NR_PIRQS; i++) {
+ irq_bindcount[pirq_to_irq(i)] = 1;
+
+#ifdef RTC_IRQ
+ /* If not domain 0, force our RTC driver to fail its probe. */
+ if ((i == RTC_IRQ) && !is_initial_xendomain())
+ continue;
+#endif
+
+ irq_desc[pirq_to_irq(i)].status = IRQ_DISABLED;
+ irq_desc[pirq_to_irq(i)].action = NULL;
+ irq_desc[pirq_to_irq(i)].depth = 1;
+ set_irq_chip_and_handler_name(pirq_to_irq(i), &pirq_chip,
+ handle_level_irq, "level");
+ }
+}
--- /dev/null
+/******************************************************************************
+ * features.c
+ *
+ * Xen feature flags.
+ *
+ * Copyright (c) 2006, Ian Campbell, XenSource Inc.
+ */
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <asm/hypervisor.h>
+#include <xen/features.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
+/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */
+EXPORT_SYMBOL(xen_features);
+
+void xen_setup_features(void)
+{
+ xen_feature_info_t fi;
+ int i, j;
+
+ for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
+ fi.submap_idx = i;
+ if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
+ break;
+ for (j=0; j<32; j++)
+ xen_features[i*32+j] = !!(fi.submap & 1<<j);
+ }
+}
--- /dev/null
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/edd.h>
+#include <video/edid.h>
+#include <xen/interface/platform.h>
+#include <asm/hypervisor.h>
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+void __init copy_edd(void)
+{
+ int ret;
+ struct xen_platform_op op;
+
+ if (!is_initial_xendomain())
+ return;
+
+ op.cmd = XENPF_firmware_info;
+
+ op.u.firmware_info.type = XEN_FW_DISK_INFO;
+ for (op.u.firmware_info.index = 0;
+ edd.edd_info_nr < EDDMAXNR;
+ op.u.firmware_info.index++) {
+ struct edd_info *info = edd.edd_info + edd.edd_info_nr;
+
+ info->params.length = sizeof(info->params);
+ set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
+ &info->params);
+ ret = HYPERVISOR_platform_op(&op);
+ if (ret)
+ break;
+
+#define C(x) info->x = op.u.firmware_info.u.disk_info.x
+ C(device);
+ C(version);
+ C(interface_support);
+ C(legacy_max_cylinder);
+ C(legacy_max_head);
+ C(legacy_sectors_per_track);
+#undef C
+
+ edd.edd_info_nr++;
+ }
+
+ op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
+ for (op.u.firmware_info.index = 0;
+ edd.mbr_signature_nr < EDD_MBR_SIG_MAX;
+ op.u.firmware_info.index++) {
+ ret = HYPERVISOR_platform_op(&op);
+ if (ret)
+ break;
+ edd.mbr_signature[edd.mbr_signature_nr++] =
+ op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
+ }
+}
+#endif
+
+void __init copy_edid(void)
+{
+#if defined(CONFIG_FIRMWARE_EDID) && defined(CONFIG_X86)
+ struct xen_platform_op op;
+
+ if (!is_initial_xendomain())
+ return;
+
+ op.cmd = XENPF_firmware_info;
+ op.u.firmware_info.index = 0;
+ op.u.firmware_info.type = XEN_FW_VBEDDC_INFO;
+ set_xen_guest_handle(op.u.firmware_info.u.vbeddc_info.edid,
+ edid_info.dummy);
+ if (HYPERVISOR_platform_op(&op) != 0)
+ memset(edid_info.dummy, 0x13, sizeof(edid_info.dummy));
+#endif
+}
--- /dev/null
+/******************************************************************************
+ * gnttab.c
+ *
+ * Granting foreign access to our memory reservation.
+ *
+ * Copyright (c) 2005-2006, Christopher Clark
+ * Copyright (c) 2004-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/seqlock.h>
+#include <xen/interface/xen.h>
+#include <xen/gnttab.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/synch_bitops.h>
+#include <asm/io.h>
+#include <xen/interface/memory.h>
+#include <xen/driver_util.h>
+#include <asm/gnttab_dma.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+/* External tools reserve first few grant table entries. */
+#define NR_RESERVED_ENTRIES 8
+#define GNTTAB_LIST_END 0xffffffff
+#define ENTRIES_PER_GRANT_FRAME (PAGE_SIZE / sizeof(grant_entry_t))
+
+static grant_ref_t **gnttab_list;
+static unsigned int nr_grant_frames;
+static unsigned int boot_max_nr_grant_frames;
+static int gnttab_free_count;
+static grant_ref_t gnttab_free_head;
+static DEFINE_SPINLOCK(gnttab_list_lock);
+
+static struct grant_entry *shared;
+
+static struct gnttab_free_callback *gnttab_free_callback_list;
+
+static int gnttab_expand(unsigned int req_entries);
+
+#define RPP (PAGE_SIZE / sizeof(grant_ref_t))
+#define gnttab_entry(entry) (gnttab_list[(entry) / RPP][(entry) % RPP])
+
+#define nr_freelist_frames(grant_frames) \
+ (((grant_frames) * ENTRIES_PER_GRANT_FRAME + RPP - 1) / RPP)
+
+static int get_free_entries(int count)
+{
+ unsigned long flags;
+ int ref, rc;
+ grant_ref_t head;
+
+ spin_lock_irqsave(&gnttab_list_lock, flags);
+
+ if ((gnttab_free_count < count) &&
+ ((rc = gnttab_expand(count - gnttab_free_count)) < 0)) {
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+ return rc;
+ }
+
+ ref = head = gnttab_free_head;
+ gnttab_free_count -= count;
+ while (count-- > 1)
+ head = gnttab_entry(head);
+ gnttab_free_head = gnttab_entry(head);
+ gnttab_entry(head) = GNTTAB_LIST_END;
+
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+
+ return ref;
+}
+
+#define get_free_entry() get_free_entries(1)
+
+static void do_free_callbacks(void)
+{
+ struct gnttab_free_callback *callback, *next;
+
+ callback = gnttab_free_callback_list;
+ gnttab_free_callback_list = NULL;
+
+ while (callback != NULL) {
+ next = callback->next;
+ if (gnttab_free_count >= callback->count) {
+ callback->next = NULL;
+ callback->fn(callback->arg);
+ } else {
+ callback->next = gnttab_free_callback_list;
+ gnttab_free_callback_list = callback;
+ }
+ callback = next;
+ }
+}
+
+static inline void check_free_callbacks(void)
+{
+ if (unlikely(gnttab_free_callback_list))
+ do_free_callbacks();
+}
+
+static void put_free_entry(grant_ref_t ref)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&gnttab_list_lock, flags);
+ gnttab_entry(ref) = gnttab_free_head;
+ gnttab_free_head = ref;
+ gnttab_free_count++;
+ check_free_callbacks();
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+}
+
+/*
+ * Public grant-issuing interface functions
+ */
+
+int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
+ int flags)
+{
+ int ref;
+
+ if (unlikely((ref = get_free_entry()) < 0))
+ return -ENOSPC;
+
+ shared[ref].frame = frame;
+ shared[ref].domid = domid;
+ wmb();
+ BUG_ON(flags & (GTF_accept_transfer | GTF_reading | GTF_writing));
+ shared[ref].flags = GTF_permit_access | flags;
+
+ return ref;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
+
+void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
+ unsigned long frame, int flags)
+{
+ shared[ref].frame = frame;
+ shared[ref].domid = domid;
+ wmb();
+ BUG_ON(flags & (GTF_accept_transfer | GTF_reading | GTF_writing));
+ shared[ref].flags = GTF_permit_access | flags;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
+
+
+int gnttab_query_foreign_access(grant_ref_t ref)
+{
+ u16 nflags;
+
+ nflags = shared[ref].flags;
+
+ return (nflags & (GTF_reading|GTF_writing));
+}
+EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
+
+int gnttab_end_foreign_access_ref(grant_ref_t ref)
+{
+ u16 flags, nflags;
+
+ nflags = shared[ref].flags;
+ do {
+ if ((flags = nflags) & (GTF_reading|GTF_writing)) {
+ printk(KERN_DEBUG "WARNING: g.e. still in use!\n");
+ return 0;
+ }
+ } while ((nflags = synch_cmpxchg_subword(&shared[ref].flags, flags, 0)) !=
+ flags);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
+
+void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page)
+{
+ if (gnttab_end_foreign_access_ref(ref)) {
+ put_free_entry(ref);
+ if (page != 0)
+ free_page(page);
+ } else {
+ /* XXX This needs to be fixed so that the ref and page are
+ placed on a list to be freed up later. */
+ printk(KERN_DEBUG
+ "WARNING: leaking g.e. and page still in use!\n");
+ }
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
+
+int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
+{
+ int ref;
+
+ if (unlikely((ref = get_free_entry()) < 0))
+ return -ENOSPC;
+ gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
+
+ return ref;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
+
+void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
+ unsigned long pfn)
+{
+ shared[ref].frame = pfn;
+ shared[ref].domid = domid;
+ wmb();
+ shared[ref].flags = GTF_accept_transfer;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
+
+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
+{
+ unsigned long frame;
+ u16 flags;
+
+ /*
+ * If a transfer is not even yet started, try to reclaim the grant
+ * reference and return failure (== 0).
+ */
+ while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
+ if (synch_cmpxchg_subword(&shared[ref].flags, flags, 0) == flags)
+ return 0;
+ cpu_relax();
+ }
+
+ /* If a transfer is in progress then wait until it is completed. */
+ while (!(flags & GTF_transfer_completed)) {
+ flags = shared[ref].flags;
+ cpu_relax();
+ }
+
+ /* Read the frame number /after/ reading completion status. */
+ rmb();
+ frame = shared[ref].frame;
+ BUG_ON(frame == 0);
+
+ return frame;
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
+
+unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
+{
+ unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
+ put_free_entry(ref);
+ return frame;
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
+
+void gnttab_free_grant_reference(grant_ref_t ref)
+{
+ put_free_entry(ref);
+}
+EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
+
+void gnttab_free_grant_references(grant_ref_t head)
+{
+ grant_ref_t ref;
+ unsigned long flags;
+ int count = 1;
+ if (head == GNTTAB_LIST_END)
+ return;
+ spin_lock_irqsave(&gnttab_list_lock, flags);
+ ref = head;
+ while (gnttab_entry(ref) != GNTTAB_LIST_END) {
+ ref = gnttab_entry(ref);
+ count++;
+ }
+ gnttab_entry(ref) = gnttab_free_head;
+ gnttab_free_head = head;
+ gnttab_free_count += count;
+ check_free_callbacks();
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+}
+EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
+
+int gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
+{
+ int h = get_free_entries(count);
+
+ if (h < 0)
+ return -ENOSPC;
+
+ *head = h;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
+
+int gnttab_empty_grant_references(const grant_ref_t *private_head)
+{
+ return (*private_head == GNTTAB_LIST_END);
+}
+EXPORT_SYMBOL_GPL(gnttab_empty_grant_references);
+
+int gnttab_claim_grant_reference(grant_ref_t *private_head)
+{
+ grant_ref_t g = *private_head;
+ if (unlikely(g == GNTTAB_LIST_END))
+ return -ENOSPC;
+ *private_head = gnttab_entry(g);
+ return g;
+}
+EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
+
+void gnttab_release_grant_reference(grant_ref_t *private_head,
+ grant_ref_t release)
+{
+ gnttab_entry(release) = *private_head;
+ *private_head = release;
+}
+EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
+
+void gnttab_request_free_callback(struct gnttab_free_callback *callback,
+ void (*fn)(void *), void *arg, u16 count)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&gnttab_list_lock, flags);
+ if (callback->next)
+ goto out;
+ callback->fn = fn;
+ callback->arg = arg;
+ callback->count = count;
+ callback->next = gnttab_free_callback_list;
+ gnttab_free_callback_list = callback;
+ check_free_callbacks();
+out:
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+}
+EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
+
+void gnttab_cancel_free_callback(struct gnttab_free_callback *callback)
+{
+ struct gnttab_free_callback **pcb;
+ unsigned long flags;
+
+ spin_lock_irqsave(&gnttab_list_lock, flags);
+ for (pcb = &gnttab_free_callback_list; *pcb; pcb = &(*pcb)->next) {
+ if (*pcb == callback) {
+ *pcb = callback->next;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+}
+EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback);
+
+static int grow_gnttab_list(unsigned int more_frames)
+{
+ unsigned int new_nr_grant_frames, extra_entries, i;
+ unsigned int nr_glist_frames, new_nr_glist_frames;
+
+ new_nr_grant_frames = nr_grant_frames + more_frames;
+ extra_entries = more_frames * ENTRIES_PER_GRANT_FRAME;
+
+ nr_glist_frames = nr_freelist_frames(nr_grant_frames);
+ new_nr_glist_frames = nr_freelist_frames(new_nr_grant_frames);
+ for (i = nr_glist_frames; i < new_nr_glist_frames; i++) {
+ gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC);
+ if (!gnttab_list[i])
+ goto grow_nomem;
+ }
+
+ for (i = ENTRIES_PER_GRANT_FRAME * nr_grant_frames;
+ i < ENTRIES_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++)
+ gnttab_entry(i) = i + 1;
+
+ gnttab_entry(i) = gnttab_free_head;
+ gnttab_free_head = ENTRIES_PER_GRANT_FRAME * nr_grant_frames;
+ gnttab_free_count += extra_entries;
+
+ nr_grant_frames = new_nr_grant_frames;
+
+ check_free_callbacks();
+
+ return 0;
+
+grow_nomem:
+ for ( ; i >= nr_glist_frames; i--)
+ free_page((unsigned long) gnttab_list[i]);
+ return -ENOMEM;
+}
+
+static unsigned int __max_nr_grant_frames(void)
+{
+ struct gnttab_query_size query;
+ int rc;
+
+ query.dom = DOMID_SELF;
+
+ rc = HYPERVISOR_grant_table_op(GNTTABOP_query_size, &query, 1);
+ if ((rc < 0) || (query.status != GNTST_okay))
+ return 4; /* Legacy max supported number of frames */
+
+ return query.max_nr_frames;
+}
+
+static inline unsigned int max_nr_grant_frames(void)
+{
+ unsigned int xen_max = __max_nr_grant_frames();
+
+ if (xen_max > boot_max_nr_grant_frames)
+ return boot_max_nr_grant_frames;
+ return xen_max;
+}
+
+#ifdef CONFIG_XEN
+
+static DEFINE_SEQLOCK(gnttab_dma_lock);
+
+#ifdef CONFIG_X86
+static int map_pte_fn(pte_t *pte, struct page *pmd_page,
+ unsigned long addr, void *data)
+{
+ unsigned long **frames = (unsigned long **)data;
+
+ set_pte_at(&init_mm, addr, pte, pfn_pte_ma((*frames)[0], PAGE_KERNEL));
+ (*frames)++;
+ return 0;
+}
+
+static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
+ unsigned long addr, void *data)
+{
+
+ set_pte_at(&init_mm, addr, pte, __pte(0));
+ return 0;
+}
+
+void *arch_gnttab_alloc_shared(unsigned long *frames)
+{
+ struct vm_struct *area;
+ area = alloc_vm_area(PAGE_SIZE * max_nr_grant_frames());
+ BUG_ON(area == NULL);
+ return area->addr;
+}
+#endif /* CONFIG_X86 */
+
+static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+{
+ struct gnttab_setup_table setup;
+ unsigned long *frames;
+ unsigned int nr_gframes = end_idx + 1;
+ int rc;
+
+ frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
+ if (!frames)
+ return -ENOMEM;
+
+ setup.dom = DOMID_SELF;
+ setup.nr_frames = nr_gframes;
+ set_xen_guest_handle(setup.frame_list, frames);
+
+ rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
+ if (rc == -ENOSYS) {
+ kfree(frames);
+ return -ENOSYS;
+ }
+
+ BUG_ON(rc || setup.status);
+
+ if (shared == NULL)
+ shared = arch_gnttab_alloc_shared(frames);
+
+#ifdef CONFIG_X86
+ rc = apply_to_page_range(&init_mm, (unsigned long)shared,
+ PAGE_SIZE * nr_gframes,
+ map_pte_fn, &frames);
+ BUG_ON(rc);
+ frames -= nr_gframes; /* adjust after map_pte_fn() */
+#endif /* CONFIG_X86 */
+
+ kfree(frames);
+
+ return 0;
+}
+
+static void gnttab_page_free(struct page *page)
+{
+ ClearPageForeign(page);
+ gnttab_reset_grant_page(page);
+ put_page(page);
+}
+
+/*
+ * Must not be called with IRQs off. This should only be used on the
+ * slow path.
+ *
+ * Copy a foreign granted page to local memory.
+ */
+int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep)
+{
+ struct gnttab_unmap_and_replace unmap;
+ mmu_update_t mmu;
+ struct page *page;
+ struct page *new_page;
+ void *new_addr;
+ void *addr;
+ paddr_t pfn;
+ maddr_t mfn;
+ maddr_t new_mfn;
+ int err;
+
+ page = *pagep;
+ if (!get_page_unless_zero(page))
+ return -ENOENT;
+
+ err = -ENOMEM;
+ new_page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+ if (!new_page)
+ goto out;
+
+ new_addr = page_address(new_page);
+ addr = page_address(page);
+ memcpy(new_addr, addr, PAGE_SIZE);
+
+ pfn = page_to_pfn(page);
+ mfn = pfn_to_mfn(pfn);
+ new_mfn = virt_to_mfn(new_addr);
+
+ write_seqlock(&gnttab_dma_lock);
+
+ /* Make seq visible before checking page_mapped. */
+ smp_mb();
+
+ /* Has the page been DMA-mapped? */
+ if (unlikely(page_mapped(page))) {
+ write_sequnlock(&gnttab_dma_lock);
+ put_page(new_page);
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ set_phys_to_machine(pfn, new_mfn);
+
+ gnttab_set_replace_op(&unmap, (unsigned long)addr,
+ (unsigned long)new_addr, ref);
+
+ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
+ &unmap, 1);
+ BUG_ON(err);
+ BUG_ON(unmap.status);
+
+ write_sequnlock(&gnttab_dma_lock);
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY);
+
+ mmu.ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+ mmu.val = pfn;
+ err = HYPERVISOR_mmu_update(&mmu, 1, NULL, DOMID_SELF);
+ BUG_ON(err);
+ }
+
+ new_page->mapping = page->mapping;
+ new_page->index = page->index;
+ set_bit(PG_foreign, &new_page->flags);
+ *pagep = new_page;
+
+ SetPageForeign(page, gnttab_page_free);
+ page->mapping = NULL;
+
+out:
+ put_page(page);
+ return err;
+}
+EXPORT_SYMBOL_GPL(gnttab_copy_grant_page);
+
+void gnttab_reset_grant_page(struct page *page)
+{
+ init_page_count(page);
+ reset_page_mapcount(page);
+}
+EXPORT_SYMBOL_GPL(gnttab_reset_grant_page);
+
+/*
+ * Keep track of foreign pages marked as PageForeign so that we don't
+ * return them to the remote domain prematurely.
+ *
+ * PageForeign pages are pinned down by increasing their mapcount.
+ *
+ * All other pages are simply returned as is.
+ */
+void __gnttab_dma_map_page(struct page *page)
+{
+ unsigned int seq;
+
+ if (!is_running_on_xen() || !PageForeign(page))
+ return;
+
+ do {
+ seq = read_seqbegin(&gnttab_dma_lock);
+
+ if (gnttab_dma_local_pfn(page))
+ break;
+
+ atomic_set(&page->_mapcount, 0);
+
+ /* Make _mapcount visible before read_seqretry. */
+ smp_mb();
+ } while (unlikely(read_seqretry(&gnttab_dma_lock, seq)));
+}
+
+int gnttab_resume(void)
+{
+ if (max_nr_grant_frames() < nr_grant_frames)
+ return -ENOSYS;
+ return gnttab_map(0, nr_grant_frames - 1);
+}
+
+int gnttab_suspend(void)
+{
+#ifdef CONFIG_X86
+ apply_to_page_range(&init_mm, (unsigned long)shared,
+ PAGE_SIZE * nr_grant_frames,
+ unmap_pte_fn, NULL);
+#endif
+ return 0;
+}
+
+#else /* !CONFIG_XEN */
+
+#include <platform-pci.h>
+
+static unsigned long resume_frames;
+
+static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+{
+ struct xen_add_to_physmap xatp;
+ unsigned int i = end_idx;
+
+ /* Loop backwards, so that the first hypercall has the largest index,
+ * ensuring that the table will grow only once.
+ */
+ do {
+ xatp.domid = DOMID_SELF;
+ xatp.idx = i;
+ xatp.space = XENMAPSPACE_grant_table;
+ xatp.gpfn = (resume_frames >> PAGE_SHIFT) + i;
+ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+ BUG();
+ } while (i-- > start_idx);
+
+ return 0;
+}
+
+int gnttab_resume(void)
+{
+ unsigned int max_nr_gframes, nr_gframes;
+
+ nr_gframes = nr_grant_frames;
+ max_nr_gframes = max_nr_grant_frames();
+ if (max_nr_gframes < nr_gframes)
+ return -ENOSYS;
+
+ if (!resume_frames) {
+ resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
+ shared = ioremap(resume_frames, PAGE_SIZE * max_nr_gframes);
+ if (shared == NULL) {
+ printk("error to ioremap gnttab share frames\n");
+ return -1;
+ }
+ }
+
+ gnttab_map(0, nr_gframes - 1);
+
+ return 0;
+}
+
+#endif /* !CONFIG_XEN */
+
+static int gnttab_expand(unsigned int req_entries)
+{
+ int rc;
+ unsigned int cur, extra;
+
+ cur = nr_grant_frames;
+ extra = ((req_entries + (ENTRIES_PER_GRANT_FRAME-1)) /
+ ENTRIES_PER_GRANT_FRAME);
+ if (cur + extra > max_nr_grant_frames())
+ return -ENOSPC;
+
+ if ((rc = gnttab_map(cur, cur + extra - 1)) == 0)
+ rc = grow_gnttab_list(extra);
+
+ return rc;
+}
+
+int __devinit gnttab_init(void)
+{
+ int i;
+ unsigned int max_nr_glist_frames, nr_glist_frames;
+ unsigned int nr_init_grefs;
+
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ nr_grant_frames = 1;
+ boot_max_nr_grant_frames = __max_nr_grant_frames();
+
+ /* Determine the maximum number of frames required for the
+ * grant reference free list on the current hypervisor.
+ */
+ max_nr_glist_frames = nr_freelist_frames(boot_max_nr_grant_frames);
+
+ gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *),
+ GFP_KERNEL);
+ if (gnttab_list == NULL)
+ return -ENOMEM;
+
+ nr_glist_frames = nr_freelist_frames(nr_grant_frames);
+ for (i = 0; i < nr_glist_frames; i++) {
+ gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL);
+ if (gnttab_list[i] == NULL)
+ goto ini_nomem;
+ }
+
+ if (gnttab_resume() < 0)
+ return -ENODEV;
+
+ nr_init_grefs = nr_grant_frames * ENTRIES_PER_GRANT_FRAME;
+
+ for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++)
+ gnttab_entry(i) = i + 1;
+
+ gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END;
+ gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
+ gnttab_free_head = NR_RESERVED_ENTRIES;
+
+ return 0;
+
+ ini_nomem:
+ for (i--; i >= 0; i--)
+ free_page((unsigned long)gnttab_list[i]);
+ kfree(gnttab_list);
+ return -ENOMEM;
+}
+
+#ifdef CONFIG_XEN
+core_initcall(gnttab_init);
+#endif
--- /dev/null
+/*
+ * copyright (c) 2006 IBM Corporation
+ * Authored by: Mike D. Day <ncmike@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <xen/hypervisor_sysfs.h>
+#include <asm/hypervisor.h>
+
+static ssize_t hyp_sysfs_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buffer)
+{
+ struct hyp_sysfs_attr *hyp_attr;
+ hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
+ if (hyp_attr->show)
+ return hyp_attr->show(hyp_attr, buffer);
+ return 0;
+}
+
+static ssize_t hyp_sysfs_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buffer,
+ size_t len)
+{
+ struct hyp_sysfs_attr *hyp_attr;
+ hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
+ if (hyp_attr->store)
+ return hyp_attr->store(hyp_attr, buffer, len);
+ return 0;
+}
+
+static struct sysfs_ops hyp_sysfs_ops = {
+ .show = hyp_sysfs_show,
+ .store = hyp_sysfs_store,
+};
+
+static struct kobj_type hyp_sysfs_kobj_type = {
+ .sysfs_ops = &hyp_sysfs_ops,
+};
+
+static int __init hypervisor_subsys_init(void)
+{
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
+ return 0;
+}
+
+device_initcall(hypervisor_subsys_init);
--- /dev/null
+/*
+ * drivers/xen/core/machine_kexec.c
+ * handle transition of Linux booting another kernel
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+
+extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki,
+ struct kimage *image);
+extern int machine_kexec_setup_resources(struct resource *hypervisor,
+ struct resource *phys_cpus,
+ int nr_phys_cpus);
+extern void machine_kexec_register_resources(struct resource *res);
+
+static int __initdata xen_max_nr_phys_cpus;
+static struct resource xen_hypervisor_res;
+static struct resource *xen_phys_cpus;
+
+void __init xen_machine_kexec_setup_resources(void)
+{
+ xen_kexec_range_t range;
+ struct resource *res;
+ int k = 0;
+
+ if (strstr(boot_command_line, "crashkernel="))
+ printk(KERN_WARNING "Ignoring crashkernel command line, "
+ "parameter will be supplied by xen\n");
+
+ if (!is_initial_xendomain())
+ return;
+
+ /* determine maximum number of physical cpus */
+
+ while (1) {
+ memset(&range, 0, sizeof(range));
+ range.range = KEXEC_RANGE_MA_CPU;
+ range.nr = k;
+
+ if(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+ break;
+
+ k++;
+ }
+
+ if (k == 0)
+ return;
+
+ xen_max_nr_phys_cpus = k;
+
+ /* allocate xen_phys_cpus */
+
+ xen_phys_cpus = alloc_bootmem_low(k * sizeof(struct resource));
+ BUG_ON(xen_phys_cpus == NULL);
+
+ /* fill in xen_phys_cpus with per-cpu crash note information */
+
+ for (k = 0; k < xen_max_nr_phys_cpus; k++) {
+ memset(&range, 0, sizeof(range));
+ range.range = KEXEC_RANGE_MA_CPU;
+ range.nr = k;
+
+ if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+ goto err;
+
+ res = xen_phys_cpus + k;
+
+ memset(res, 0, sizeof(*res));
+ res->name = "Crash note";
+ res->start = range.start;
+ res->end = range.start + range.size - 1;
+ res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+ }
+
+ /* fill in xen_hypervisor_res with hypervisor machine address range */
+
+ memset(&range, 0, sizeof(range));
+ range.range = KEXEC_RANGE_MA_XEN;
+
+ if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+ goto err;
+
+ xen_hypervisor_res.name = "Hypervisor code and data";
+ xen_hypervisor_res.start = range.start;
+ xen_hypervisor_res.end = range.start + range.size - 1;
+ xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+#ifdef CONFIG_X86_64
+ insert_resource(iomem_resource, &xen_hypervisor_res);
+#endif
+
+ /* fill in crashk_res if range is reserved by hypervisor */
+
+ memset(&range, 0, sizeof(range));
+ range.range = KEXEC_RANGE_MA_CRASH;
+
+ if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+ goto err;
+
+ if (range.size) {
+ crashk_res.start = range.start;
+ crashk_res.end = range.start + range.size - 1;
+ }
+
+ if (machine_kexec_setup_resources(&xen_hypervisor_res, xen_phys_cpus,
+ xen_max_nr_phys_cpus))
+ goto err;
+
+ return;
+
+ err:
+ /*
+ * It isn't possible to free xen_phys_cpus this early in the
+ * boot. Failure at this stage is unexpected and the amount of
+ * memory is small therefore we tolerate the potential leak.
+ */
+ xen_max_nr_phys_cpus = 0;
+ return;
+}
+
+#ifndef CONFIG_X86_64
+void __init xen_machine_kexec_register_resources(struct resource *res)
+{
+ request_resource(res, &xen_hypervisor_res);
+ machine_kexec_register_resources(res);
+}
+#endif
+
+static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+ machine_kexec_setup_load_arg(xki, image);
+
+ xki->indirection_page = image->head;
+ xki->start_address = image->start;
+}
+
+/*
+ * Load the image into xen so xen can kdump itself
+ * This might have been done in prepare, but prepare
+ * is currently called too early. It might make sense
+ * to move prepare, but for now, just add an extra hook.
+ */
+int xen_machine_kexec_load(struct kimage *image)
+{
+ xen_kexec_load_t xkl;
+
+ memset(&xkl, 0, sizeof(xkl));
+ xkl.type = image->type;
+ setup_load_arg(&xkl.image, image);
+ return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
+}
+
+/*
+ * Unload the image that was stored by machine_kexec_load()
+ * This might have been done in machine_kexec_cleanup() but it
+ * is called too late, and its possible xen could try and kdump
+ * using resources that have been freed.
+ */
+void xen_machine_kexec_unload(struct kimage *image)
+{
+ xen_kexec_load_t xkl;
+
+ memset(&xkl, 0, sizeof(xkl));
+ xkl.type = image->type;
+ WARN_ON(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl));
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ *
+ * This has the hypervisor move to the prefered reboot CPU,
+ * stop all CPUs and kexec. That is it combines machine_shutdown()
+ * and machine_kexec() in Linux kexec terms.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+ xen_kexec_exec_t xke;
+
+ memset(&xke, 0, sizeof(xke));
+ xke.type = image->type;
+ VOID(HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke));
+ panic("KEXEC_CMD_kexec hypercall should not return\n");
+}
+
+void machine_shutdown(void)
+{
+ /* do nothing */
+}
+
+
+/*
+ * Local variables:
+ * c-file-style: "linux"
+ * indent-tabs-mode: t
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * tab-width: 8
+ * End:
+ */
--- /dev/null
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/unistd.h>
+#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/sysrq.h>
+#include <linux/stringify.h>
+#include <linux/stop_machine.h>
+#include <asm/irq.h>
+#include <asm/mmu_context.h>
+#include <xen/evtchn.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <linux/cpu.h>
+#include <xen/gnttab.h>
+#include <xen/xencons.h>
+#include <xen/cpu_hotplug.h>
+#include <xen/interface/vcpu.h>
+
+#if defined(__i386__) || defined(__x86_64__)
+
+/*
+ * Power off function, if any
+ */
+void (*pm_power_off)(void);
+EXPORT_SYMBOL(pm_power_off);
+
+void machine_emergency_restart(void)
+{
+ /* We really want to get pending console data out before we die. */
+ xencons_force_flush();
+ HYPERVISOR_shutdown(SHUTDOWN_reboot);
+}
+
+void machine_restart(char * __unused)
+{
+ machine_emergency_restart();
+}
+
+void machine_halt(void)
+{
+ machine_power_off();
+}
+
+void machine_power_off(void)
+{
+ /* We really want to get pending console data out before we die. */
+ xencons_force_flush();
+ if (pm_power_off)
+ pm_power_off();
+ HYPERVISOR_shutdown(SHUTDOWN_poweroff);
+}
+
+int reboot_thru_bios = 0; /* for dmi_scan.c */
+EXPORT_SYMBOL(machine_restart);
+EXPORT_SYMBOL(machine_halt);
+EXPORT_SYMBOL(machine_power_off);
+
+static void pre_suspend(void)
+{
+ HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
+ WARN_ON(HYPERVISOR_update_va_mapping(fix_to_virt(FIX_SHARED_INFO),
+ __pte_ma(0), 0));
+
+ xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
+ xen_start_info->console.domU.mfn =
+ mfn_to_pfn(xen_start_info->console.domU.mfn);
+}
+
+static void post_suspend(int suspend_cancelled)
+{
+ int i, j, k, fpp;
+ unsigned long shinfo_mfn;
+ extern unsigned long max_pfn;
+ extern unsigned long *pfn_to_mfn_frame_list_list;
+ extern unsigned long *pfn_to_mfn_frame_list[];
+
+ if (suspend_cancelled) {
+ xen_start_info->store_mfn =
+ pfn_to_mfn(xen_start_info->store_mfn);
+ xen_start_info->console.domU.mfn =
+ pfn_to_mfn(xen_start_info->console.domU.mfn);
+ } else {
+#ifdef CONFIG_SMP
+ cpu_initialized_map = cpu_online_map;
+#endif
+ }
+
+ shinfo_mfn = xen_start_info->shared_info >> PAGE_SHIFT;
+ if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_SHARED_INFO),
+ pfn_pte_ma(shinfo_mfn, PAGE_KERNEL),
+ 0))
+ BUG();
+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+
+ memset(empty_zero_page, 0, PAGE_SIZE);
+
+ fpp = PAGE_SIZE/sizeof(unsigned long);
+ for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
+ if ((j % fpp) == 0) {
+ k++;
+ pfn_to_mfn_frame_list_list[k] =
+ virt_to_mfn(pfn_to_mfn_frame_list[k]);
+ j = 0;
+ }
+ pfn_to_mfn_frame_list[k][j] =
+ virt_to_mfn(&phys_to_machine_mapping[i]);
+ }
+ HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(pfn_to_mfn_frame_list_list);
+}
+
+#else /* !(defined(__i386__) || defined(__x86_64__)) */
+
+#ifndef HAVE_XEN_PRE_SUSPEND
+#define xen_pre_suspend() ((void)0)
+#endif
+
+#ifndef HAVE_XEN_POST_SUSPEND
+#define xen_post_suspend(x) ((void)0)
+#endif
+
+#define switch_idle_mm() ((void)0)
+#define mm_pin_all() ((void)0)
+#define pre_suspend() xen_pre_suspend()
+#define post_suspend(x) xen_post_suspend(x)
+
+#endif
+
+struct suspend {
+ int fast_suspend;
+ void (*resume_notifier)(void);
+};
+
+static int take_machine_down(void *_suspend)
+{
+ struct suspend *suspend = _suspend;
+ int suspend_cancelled, err;
+ extern void time_resume(void);
+
+ if (suspend->fast_suspend) {
+ BUG_ON(!irqs_disabled());
+ } else {
+ BUG_ON(irqs_disabled());
+
+ for (;;) {
+ err = smp_suspend();
+ if (err)
+ return err;
+
+ xenbus_suspend();
+ preempt_disable();
+
+ if (num_online_cpus() == 1)
+ break;
+
+ preempt_enable();
+ xenbus_suspend_cancel();
+ }
+
+ local_irq_disable();
+ }
+
+ mm_pin_all();
+ gnttab_suspend();
+ pre_suspend();
+
+ /*
+ * This hypercall returns 1 if suspend was cancelled or the domain was
+ * merely checkpointed, and 0 if it is resuming in a new domain.
+ */
+ suspend_cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
+
+ suspend->resume_notifier();
+ post_suspend(suspend_cancelled);
+ gnttab_resume();
+ if (!suspend_cancelled) {
+ irq_resume();
+#ifdef __x86_64__
+ /*
+ * Older versions of Xen do not save/restore the user %cr3.
+ * We do it here just in case, but there's no need if we are
+ * in fast-suspend mode as that implies a new enough Xen.
+ */
+ if (!suspend->fast_suspend) {
+ struct mmuext_op op;
+ op.cmd = MMUEXT_NEW_USER_BASEPTR;
+ op.arg1.mfn = pfn_to_mfn(__pa(__user_pgd(
+ current->active_mm->pgd)) >> PAGE_SHIFT);
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+ }
+#endif
+ }
+ time_resume();
+
+ if (!suspend->fast_suspend)
+ local_irq_enable();
+
+ return suspend_cancelled;
+}
+
+int __xen_suspend(int fast_suspend, void (*resume_notifier)(void))
+{
+ int err, suspend_cancelled;
+ struct suspend suspend;
+
+ BUG_ON(smp_processor_id() != 0);
+ BUG_ON(in_interrupt());
+
+#if defined(__i386__) || defined(__x86_64__)
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+ printk(KERN_WARNING "Cannot suspend in "
+ "auto_translated_physmap mode.\n");
+ return -EOPNOTSUPP;
+ }
+#endif
+
+ /* If we are definitely UP then 'slow mode' is actually faster. */
+ if (num_possible_cpus() == 1)
+ fast_suspend = 0;
+
+ suspend.fast_suspend = fast_suspend;
+ suspend.resume_notifier = resume_notifier;
+
+ if (fast_suspend) {
+ xenbus_suspend();
+ err = stop_machine_run(take_machine_down, &suspend, 0);
+ if (err < 0)
+ xenbus_suspend_cancel();
+ } else {
+ err = take_machine_down(&suspend);
+ }
+
+ if (err < 0)
+ return err;
+
+ suspend_cancelled = err;
+ if (!suspend_cancelled) {
+ xencons_resume();
+ xenbus_resume();
+ } else {
+ xenbus_suspend_cancel();
+ }
+
+ if (!fast_suspend)
+ smp_resume();
+
+ return 0;
+}
--- /dev/null
+#define __KERNEL_SYSCALLS__
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/unistd.h>
+#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/sysrq.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#undef handle_sysrq
+#endif
+
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define SHUTDOWN_INVALID -1
+#define SHUTDOWN_POWEROFF 0
+#define SHUTDOWN_SUSPEND 2
+#define SHUTDOWN_RESUMING 3
+#define SHUTDOWN_HALT 4
+
+/* Ignore multiple shutdown requests. */
+static int shutting_down = SHUTDOWN_INVALID;
+
+/* Can we leave APs online when we suspend? */
+static int fast_suspend;
+
+static void __shutdown_handler(struct work_struct *unused);
+static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
+
+int __xen_suspend(int fast_suspend, void (*resume_notifier)(void));
+
+static int shutdown_process(void *__unused)
+{
+ static char *envp[] = { "HOME=/", "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
+ static char *poweroff_argv[] = { "/sbin/poweroff", NULL };
+
+ extern asmlinkage long sys_reboot(int magic1, int magic2,
+ unsigned int cmd, void *arg);
+
+ if ((shutting_down == SHUTDOWN_POWEROFF) ||
+ (shutting_down == SHUTDOWN_HALT)) {
+ if (call_usermodehelper("/sbin/poweroff", poweroff_argv,
+ envp, 0) < 0) {
+#ifdef CONFIG_XEN
+ sys_reboot(LINUX_REBOOT_MAGIC1,
+ LINUX_REBOOT_MAGIC2,
+ LINUX_REBOOT_CMD_POWER_OFF,
+ NULL);
+#endif /* CONFIG_XEN */
+ }
+ }
+
+ shutting_down = SHUTDOWN_INVALID; /* could try again */
+
+ return 0;
+}
+
+static void xen_resume_notifier(void)
+{
+ int old_state = xchg(&shutting_down, SHUTDOWN_RESUMING);
+ BUG_ON(old_state != SHUTDOWN_SUSPEND);
+}
+
+static int xen_suspend(void *__unused)
+{
+ int err, old_state;
+
+ daemonize("suspend");
+ err = set_cpus_allowed(current, cpumask_of_cpu(0));
+ if (err) {
+ printk(KERN_ERR "Xen suspend can't run on CPU0 (%d)\n", err);
+ goto fail;
+ }
+
+ do {
+ err = __xen_suspend(fast_suspend, xen_resume_notifier);
+ if (err) {
+ printk(KERN_ERR "Xen suspend failed (%d)\n", err);
+ goto fail;
+ }
+ old_state = cmpxchg(
+ &shutting_down, SHUTDOWN_RESUMING, SHUTDOWN_INVALID);
+ } while (old_state == SHUTDOWN_SUSPEND);
+
+ switch (old_state) {
+ case SHUTDOWN_INVALID:
+ case SHUTDOWN_SUSPEND:
+ BUG();
+ case SHUTDOWN_RESUMING:
+ break;
+ default:
+ schedule_delayed_work(&shutdown_work, 0);
+ break;
+ }
+
+ return 0;
+
+ fail:
+ old_state = xchg(&shutting_down, SHUTDOWN_INVALID);
+ BUG_ON(old_state != SHUTDOWN_SUSPEND);
+ return 0;
+}
+
+static void __shutdown_handler(struct work_struct *unused)
+{
+ int err;
+
+ err = kernel_thread((shutting_down == SHUTDOWN_SUSPEND) ?
+ xen_suspend : shutdown_process,
+ NULL, CLONE_FS | CLONE_FILES);
+
+ if (err < 0) {
+ printk(KERN_WARNING "Error creating shutdown process (%d): "
+ "retrying...\n", -err);
+ schedule_delayed_work(&shutdown_work, HZ/2);
+ }
+}
+
+static void shutdown_handler(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ extern void ctrl_alt_del(void);
+ char *str;
+ struct xenbus_transaction xbt;
+ int err, old_state, new_state = SHUTDOWN_INVALID;
+
+ if ((shutting_down != SHUTDOWN_INVALID) &&
+ (shutting_down != SHUTDOWN_RESUMING))
+ return;
+
+ again:
+ err = xenbus_transaction_start(&xbt);
+ if (err)
+ return;
+
+ str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
+ /* Ignore read errors and empty reads. */
+ if (XENBUS_IS_ERR_READ(str)) {
+ xenbus_transaction_end(xbt, 1);
+ return;
+ }
+
+ xenbus_write(xbt, "control", "shutdown", "");
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN) {
+ kfree(str);
+ goto again;
+ }
+
+ if (strcmp(str, "poweroff") == 0)
+ new_state = SHUTDOWN_POWEROFF;
+ else if (strcmp(str, "reboot") == 0)
+ ctrl_alt_del();
+ else if (strcmp(str, "suspend") == 0)
+ new_state = SHUTDOWN_SUSPEND;
+ else if (strcmp(str, "halt") == 0)
+ new_state = SHUTDOWN_HALT;
+ else
+ printk("Ignoring shutdown request: %s\n", str);
+
+ if (new_state != SHUTDOWN_INVALID) {
+ old_state = xchg(&shutting_down, new_state);
+ if (old_state == SHUTDOWN_INVALID)
+ schedule_delayed_work(&shutdown_work, 0);
+ else
+ BUG_ON(old_state != SHUTDOWN_RESUMING);
+ }
+
+ kfree(str);
+}
+
+static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
+ unsigned int len)
+{
+ char sysrq_key = '\0';
+ struct xenbus_transaction xbt;
+ int err;
+
+ again:
+ err = xenbus_transaction_start(&xbt);
+ if (err)
+ return;
+ if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
+ printk(KERN_ERR "Unable to read sysrq code in "
+ "control/sysrq\n");
+ xenbus_transaction_end(xbt, 1);
+ return;
+ }
+
+ if (sysrq_key != '\0')
+ xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN)
+ goto again;
+
+#ifdef CONFIG_MAGIC_SYSRQ
+ if (sysrq_key != '\0')
+ handle_sysrq(sysrq_key, NULL);
+#endif
+}
+
+static struct xenbus_watch shutdown_watch = {
+ .node = "control/shutdown",
+ .callback = shutdown_handler
+};
+
+static struct xenbus_watch sysrq_watch = {
+ .node = "control/sysrq",
+ .callback = sysrq_handler
+};
+
+static int setup_shutdown_watcher(void)
+{
+ int err;
+
+ xenbus_scanf(XBT_NIL, "control",
+ "platform-feature-multiprocessor-suspend",
+ "%d", &fast_suspend);
+
+ err = register_xenbus_watch(&shutdown_watch);
+ if (err) {
+ printk(KERN_ERR "Failed to set shutdown watcher\n");
+ return err;
+ }
+
+ err = register_xenbus_watch(&sysrq_watch);
+ if (err) {
+ printk(KERN_ERR "Failed to set sysrq watcher\n");
+ return err;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_XEN
+
+static int shutdown_event(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ setup_shutdown_watcher();
+ return NOTIFY_DONE;
+}
+
+static int __init setup_shutdown_event(void)
+{
+ static struct notifier_block xenstore_notifier = {
+ .notifier_call = shutdown_event
+ };
+ register_xenstore_notifier(&xenstore_notifier);
+
+ return 0;
+}
+
+subsys_initcall(setup_shutdown_event);
+
+#else /* !defined(CONFIG_XEN) */
+
+int xen_reboot_init(void)
+{
+ return setup_shutdown_watcher();
+}
+
+#endif /* !defined(CONFIG_XEN) */
--- /dev/null
+/*
+ * Xen SMP booting functions
+ *
+ * See arch/i386/kernel/smpboot.c for copyright and credits for derived
+ * portions of this file.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/smp_lock.h>
+#include <linux/irq.h>
+#include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/percpu.h>
+#include <asm/desc.h>
+#include <asm/arch_hooks.h>
+#include <asm/pgalloc.h>
+#include <xen/evtchn.h>
+#include <xen/interface/vcpu.h>
+#include <xen/cpu_hotplug.h>
+#include <xen/xenbus.h>
+
+extern irqreturn_t smp_reschedule_interrupt(int, void *);
+extern irqreturn_t smp_call_function_interrupt(int, void *);
+
+extern int local_setup_timer(unsigned int cpu);
+extern void local_teardown_timer(unsigned int cpu);
+
+extern void hypervisor_callback(void);
+extern void failsafe_callback(void);
+extern void system_call(void);
+extern void smp_trap_init(trap_info_t *);
+
+/* Number of siblings per CPU package */
+int smp_num_siblings = 1;
+
+cpumask_t cpu_online_map;
+EXPORT_SYMBOL(cpu_online_map);
+cpumask_t cpu_possible_map;
+EXPORT_SYMBOL(cpu_possible_map);
+cpumask_t cpu_initialized_map;
+
+DEFINE_PER_CPU(struct cpuinfo_x86, cpu_info);
+EXPORT_PER_CPU_SYMBOL(cpu_info);
+
+#ifdef CONFIG_HOTPLUG_CPU
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+#endif
+
+static DEFINE_PER_CPU(int, resched_irq);
+static DEFINE_PER_CPU(int, callfunc_irq);
+static char resched_name[NR_CPUS][15];
+static char callfunc_name[NR_CPUS][15];
+
+#ifdef CONFIG_X86_32
+u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+#define set_cpu_to_apicid(cpu, apicid) \
+ (per_cpu(x86_cpu_to_apicid, cpu) = cpu_2_logical_apicid[cpu] = (apicid))
+#else
+#define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
+#endif
+
+DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
+DEFINE_PER_CPU(cpumask_t, cpu_core_map);
+EXPORT_PER_CPU_SYMBOL(cpu_core_map);
+
+void __init prefill_possible_map(void)
+{
+ int i, rc;
+
+ for_each_possible_cpu(i)
+ if (i != smp_processor_id())
+ return;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+ if (rc >= 0)
+ cpu_set(i, cpu_possible_map);
+ }
+}
+
+void __init smp_alloc_memory(void)
+{
+}
+
+static inline void
+set_cpu_sibling_map(unsigned int cpu)
+{
+ cpu_data(cpu).phys_proc_id = cpu;
+ cpu_data(cpu).cpu_core_id = 0;
+
+ per_cpu(cpu_sibling_map, cpu) = cpumask_of_cpu(cpu);
+ per_cpu(cpu_core_map, cpu) = cpumask_of_cpu(cpu);
+
+ cpu_data(cpu).booted_cores = 1;
+}
+
+static void
+remove_siblinginfo(unsigned int cpu)
+{
+ cpu_data(cpu).phys_proc_id = BAD_APICID;
+ cpu_data(cpu).cpu_core_id = BAD_APICID;
+
+ cpus_clear(per_cpu(cpu_sibling_map, cpu));
+ cpus_clear(per_cpu(cpu_core_map, cpu));
+
+ cpu_data(cpu).booted_cores = 0;
+}
+
+static int __cpuinit xen_smp_intr_init(unsigned int cpu)
+{
+ int rc;
+
+ per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
+
+ sprintf(resched_name[cpu], "resched%u", cpu);
+ rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
+ cpu,
+ smp_reschedule_interrupt,
+ IRQF_DISABLED|IRQF_NOBALANCING,
+ resched_name[cpu],
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(resched_irq, cpu) = rc;
+
+ sprintf(callfunc_name[cpu], "callfunc%u", cpu);
+ rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
+ cpu,
+ smp_call_function_interrupt,
+ IRQF_DISABLED|IRQF_NOBALANCING,
+ callfunc_name[cpu],
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(callfunc_irq, cpu) = rc;
+
+ if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
+ goto fail;
+
+ return 0;
+
+ fail:
+ if (per_cpu(resched_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+ if (per_cpu(callfunc_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+ return rc;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void __cpuexit xen_smp_intr_exit(unsigned int cpu)
+{
+ if (cpu != 0)
+ local_teardown_timer(cpu);
+
+ unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+}
+#endif
+
+void __cpuinit cpu_bringup(void)
+{
+ cpu_init();
+#ifdef __i386__
+ identify_secondary_cpu(¤t_cpu_data);
+#else
+ identify_cpu(¤t_cpu_data);
+#endif
+ touch_softlockup_watchdog();
+ preempt_disable();
+ local_irq_enable();
+}
+
+static void __cpuinit cpu_bringup_and_idle(void)
+{
+ cpu_bringup();
+ cpu_idle();
+}
+
+static void __cpuinit cpu_initialize_context(unsigned int cpu)
+{
+ /* vcpu_guest_context_t is too large to allocate on the stack.
+ * Hence we allocate statically and protect it with a lock */
+ static vcpu_guest_context_t ctxt;
+ static DEFINE_SPINLOCK(ctxt_lock);
+
+ struct task_struct *idle = idle_task(cpu);
+
+ if (cpu_test_and_set(cpu, cpu_initialized_map))
+ return;
+
+ spin_lock(&ctxt_lock);
+
+ memset(&ctxt, 0, sizeof(ctxt));
+
+ ctxt.flags = VGCF_IN_KERNEL;
+ ctxt.user_regs.ds = __USER_DS;
+ ctxt.user_regs.es = __USER_DS;
+ ctxt.user_regs.fs = 0;
+ ctxt.user_regs.gs = 0;
+ ctxt.user_regs.ss = __KERNEL_DS;
+ ctxt.user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+ ctxt.user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */
+
+ memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
+
+ smp_trap_init(ctxt.trap_ctxt);
+
+ ctxt.ldt_ents = 0;
+ ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
+ ctxt.gdt_ents = GDT_SIZE / 8;
+
+ ctxt.user_regs.cs = __KERNEL_CS;
+ ctxt.user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
+
+ ctxt.kernel_ss = __KERNEL_DS;
+ ctxt.kernel_sp = idle->thread.sp0;
+
+ ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
+ ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
+#ifdef __i386__
+ ctxt.event_callback_cs = __KERNEL_CS;
+ ctxt.failsafe_callback_cs = __KERNEL_CS;
+
+ ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+
+ ctxt.user_regs.fs = __KERNEL_PERCPU;
+#else /* __x86_64__ */
+ ctxt.syscall_callback_eip = (unsigned long)system_call;
+
+ ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
+
+ ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
+#endif
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt))
+ BUG();
+
+ spin_unlock(&ctxt_lock);
+}
+
+void __init smp_prepare_cpus(unsigned int max_cpus)
+{
+ unsigned int cpu;
+ struct task_struct *idle;
+ int apicid;
+ struct vcpu_get_physid cpu_id;
+#ifdef __x86_64__
+ struct desc_ptr *gdt_descr;
+#endif
+ void *gdt_addr;
+
+ apicid = 0;
+ if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, 0, &cpu_id) == 0)
+ apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
+ boot_cpu_data.apicid = apicid;
+ cpu_data(0) = boot_cpu_data;
+
+ set_cpu_to_apicid(0, apicid);
+
+ current_thread_info()->cpu = 0;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ cpus_clear(per_cpu(cpu_sibling_map, cpu));
+ cpus_clear(per_cpu(cpu_core_map, cpu));
+ }
+
+ set_cpu_sibling_map(0);
+
+ if (xen_smp_intr_init(0))
+ BUG();
+
+ cpu_initialized_map = cpumask_of_cpu(0);
+
+ /* Restrict the possible_map according to max_cpus. */
+ while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
+ for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
+ continue;
+ cpu_clear(cpu, cpu_possible_map);
+ }
+
+ for_each_possible_cpu (cpu) {
+ if (cpu == 0)
+ continue;
+
+ idle = fork_idle(cpu);
+ if (IS_ERR(idle))
+ panic("failed fork for CPU %d", cpu);
+
+#ifdef __x86_64__
+ gdt_descr = &cpu_gdt_descr[cpu];
+ gdt_descr->address = get_zeroed_page(GFP_KERNEL);
+ if (unlikely(!gdt_descr->address)) {
+ printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
+ cpu);
+ continue;
+ }
+ gdt_descr->size = GDT_SIZE;
+ memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
+ gdt_addr = (void *)gdt_descr->address;
+#else
+ init_gdt(cpu);
+ gdt_addr = get_cpu_gdt_table(cpu);
+#endif
+ make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
+
+ apicid = cpu;
+ if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0)
+ apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
+ cpu_data(cpu) = boot_cpu_data;
+ cpu_data(cpu).cpu_index = cpu;
+ cpu_data(cpu).apicid = apicid;
+
+ set_cpu_to_apicid(cpu, apicid);
+
+#ifdef __x86_64__
+ cpu_pda(cpu)->pcurrent = idle;
+ cpu_pda(cpu)->cpunumber = cpu;
+ clear_ti_thread_flag(task_thread_info(idle), TIF_FORK);
+#else
+ per_cpu(current_task, cpu) = idle;
+#endif
+
+ irq_ctx_init(cpu);
+
+#ifdef CONFIG_HOTPLUG_CPU
+ if (is_initial_xendomain())
+ cpu_set(cpu, cpu_present_map);
+#else
+ cpu_set(cpu, cpu_present_map);
+#endif
+ }
+
+ init_xenbus_allowed_cpumask();
+
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * Here we can be sure that there is an IO-APIC in the system. Let's
+ * go and set it up:
+ */
+ if (!skip_ioapic_setup && nr_ioapics)
+ setup_IO_APIC();
+#endif
+}
+
+void __init smp_prepare_boot_cpu(void)
+{
+#ifdef __i386__
+ init_gdt(smp_processor_id());
+ switch_to_new_gdt();
+#endif
+ prefill_possible_map();
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Initialize cpu_present_map late to skip SMP boot code in init/main.c.
+ * But do it early enough to catch critical for_each_present_cpu() loops
+ * in i386-specific code.
+ */
+static int __init initialize_cpu_present_map(void)
+{
+ cpu_present_map = cpu_possible_map;
+ return 0;
+}
+core_initcall(initialize_cpu_present_map);
+
+int __cpuexit __cpu_disable(void)
+{
+ cpumask_t map = cpu_online_map;
+ unsigned int cpu = smp_processor_id();
+
+ if (cpu == 0)
+ return -EBUSY;
+
+ remove_siblinginfo(cpu);
+
+ cpu_clear(cpu, map);
+ fixup_irqs(map);
+ cpu_clear(cpu, cpu_online_map);
+
+ return 0;
+}
+
+void __cpuexit __cpu_die(unsigned int cpu)
+{
+ while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
+ current->state = TASK_UNINTERRUPTIBLE;
+ schedule_timeout(HZ/10);
+ }
+
+ xen_smp_intr_exit(cpu);
+
+ if (num_online_cpus() == 1)
+ alternatives_smp_switch(0);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __cpuinit __cpu_up(unsigned int cpu)
+{
+ int rc;
+
+ rc = cpu_up_check(cpu);
+ if (rc)
+ return rc;
+
+ cpu_initialize_context(cpu);
+
+ if (num_online_cpus() == 1)
+ alternatives_smp_switch(1);
+
+ /* This must be done before setting cpu_online_map */
+ set_cpu_sibling_map(cpu);
+ wmb();
+
+ rc = xen_smp_intr_init(cpu);
+ if (rc) {
+ remove_siblinginfo(cpu);
+ return rc;
+ }
+
+ cpu_set(cpu, cpu_online_map);
+
+ rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
+ BUG_ON(rc);
+
+ return 0;
+}
+
+void __init smp_cpus_done(unsigned int max_cpus)
+{
+}
+
+#ifndef CONFIG_X86_LOCAL_APIC
+int setup_profiling_timer(unsigned int multiplier)
+{
+ return -EINVAL;
+}
+#endif
--- /dev/null
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <xen/xen_proc.h>
+
+static struct proc_dir_entry *xen_base;
+
+struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode)
+{
+ if ( xen_base == NULL )
+ if ( (xen_base = proc_mkdir("xen", NULL)) == NULL )
+ panic("Couldn't create /proc/xen");
+ return create_proc_entry(name, mode, xen_base);
+}
+
+EXPORT_SYMBOL_GPL(create_xen_proc_entry);
+
+void remove_xen_proc_entry(const char *name)
+{
+ remove_proc_entry(name, xen_base);
+}
+
+EXPORT_SYMBOL_GPL(remove_xen_proc_entry);
--- /dev/null
+/*
+ * copyright (c) 2006 IBM Corporation
+ * Authored by: Mike D. Day <ncmike@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <asm/hypervisor.h>
+#include <xen/features.h>
+#include <xen/hypervisor_sysfs.h>
+#include <xen/xenbus.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Mike D. Day <ncmike@us.ibm.com>");
+
+static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ return sprintf(buffer, "xen\n");
+}
+
+HYPERVISOR_ATTR_RO(type);
+
+static int __init xen_sysfs_type_init(void)
+{
+ return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
+}
+
+static void xen_sysfs_type_destroy(void)
+{
+ sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
+}
+
+/* xen version attributes */
+static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int version = HYPERVISOR_xen_version(XENVER_version, NULL);
+ if (version)
+ return sprintf(buffer, "%d\n", version >> 16);
+ return -ENODEV;
+}
+
+HYPERVISOR_ATTR_RO(major);
+
+static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int version = HYPERVISOR_xen_version(XENVER_version, NULL);
+ if (version)
+ return sprintf(buffer, "%d\n", version & 0xff);
+ return -ENODEV;
+}
+
+HYPERVISOR_ATTR_RO(minor);
+
+static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret = -ENOMEM;
+ char *extra;
+
+ extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
+ if (extra) {
+ ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
+ if (!ret)
+ ret = sprintf(buffer, "%s\n", extra);
+ kfree(extra);
+ }
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(extra);
+
+static struct attribute *version_attrs[] = {
+ &major_attr.attr,
+ &minor_attr.attr,
+ &extra_attr.attr,
+ NULL
+};
+
+static struct attribute_group version_group = {
+ .name = "version",
+ .attrs = version_attrs,
+};
+
+static int __init xen_sysfs_version_init(void)
+{
+ return sysfs_create_group(hypervisor_kobj, &version_group);
+}
+
+static void xen_sysfs_version_destroy(void)
+{
+ sysfs_remove_group(hypervisor_kobj, &version_group);
+}
+
+/* UUID */
+
+static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ char *vm, *val;
+ int ret;
+ extern int xenstored_ready;
+
+ if (!xenstored_ready)
+ return -EBUSY;
+
+ vm = xenbus_read(XBT_NIL, "vm", "", NULL);
+ if (IS_ERR(vm))
+ return PTR_ERR(vm);
+ val = xenbus_read(XBT_NIL, vm, "uuid", NULL);
+ kfree(vm);
+ if (IS_ERR(val))
+ return PTR_ERR(val);
+ ret = sprintf(buffer, "%s\n", val);
+ kfree(val);
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(uuid);
+
+static int __init xen_sysfs_uuid_init(void)
+{
+ return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
+}
+
+static void xen_sysfs_uuid_destroy(void)
+{
+ sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
+}
+
+/* xen compilation attributes */
+
+static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret = -ENOMEM;
+ struct xen_compile_info *info;
+
+ info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+ if (info) {
+ ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+ if (!ret)
+ ret = sprintf(buffer, "%s\n", info->compiler);
+ kfree(info);
+ }
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(compiler);
+
+static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret = -ENOMEM;
+ struct xen_compile_info *info;
+
+ info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+ if (info) {
+ ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+ if (!ret)
+ ret = sprintf(buffer, "%s\n", info->compile_by);
+ kfree(info);
+ }
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(compiled_by);
+
+static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret = -ENOMEM;
+ struct xen_compile_info *info;
+
+ info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+ if (info) {
+ ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+ if (!ret)
+ ret = sprintf(buffer, "%s\n", info->compile_date);
+ kfree(info);
+ }
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(compile_date);
+
+static struct attribute *xen_compile_attrs[] = {
+ &compiler_attr.attr,
+ &compiled_by_attr.attr,
+ &compile_date_attr.attr,
+ NULL
+};
+
+static struct attribute_group xen_compilation_group = {
+ .name = "compilation",
+ .attrs = xen_compile_attrs,
+};
+
+int __init static xen_compilation_init(void)
+{
+ return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
+}
+
+static void xen_compilation_destroy(void)
+{
+ sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
+}
+
+/* xen properties info */
+
+static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret = -ENOMEM;
+ char *caps;
+
+ caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
+ if (caps) {
+ ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
+ if (!ret)
+ ret = sprintf(buffer, "%s\n", caps);
+ kfree(caps);
+ }
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(capabilities);
+
+static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret = -ENOMEM;
+ char *cset;
+
+ cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
+ if (cset) {
+ ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
+ if (!ret)
+ ret = sprintf(buffer, "%s\n", cset);
+ kfree(cset);
+ }
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(changeset);
+
+static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret = -ENOMEM;
+ struct xen_platform_parameters *parms;
+
+ parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
+ if (parms) {
+ ret = HYPERVISOR_xen_version(XENVER_platform_parameters,
+ parms);
+ if (!ret)
+ ret = sprintf(buffer, "%lx\n", parms->virt_start);
+ kfree(parms);
+ }
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(virtual_start);
+
+static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret;
+
+ ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
+ if (ret > 0)
+ ret = sprintf(buffer, "%x\n", ret);
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(pagesize);
+
+/* eventually there will be several more features to export */
+static ssize_t xen_feature_show(int index, char *buffer)
+{
+ int ret = -ENOMEM;
+ struct xen_feature_info *info;
+
+ info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL);
+ if (info) {
+ info->submap_idx = index;
+ ret = HYPERVISOR_xen_version(XENVER_get_features, info);
+ if (!ret)
+ ret = sprintf(buffer, "%d\n", info->submap);
+ kfree(info);
+ }
+
+ return ret;
+}
+
+static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ return xen_feature_show(XENFEAT_writable_page_tables, buffer);
+}
+
+HYPERVISOR_ATTR_RO(writable_pt);
+
+static struct attribute *xen_properties_attrs[] = {
+ &capabilities_attr.attr,
+ &changeset_attr.attr,
+ &virtual_start_attr.attr,
+ &pagesize_attr.attr,
+ &writable_pt_attr.attr,
+ NULL
+};
+
+static struct attribute_group xen_properties_group = {
+ .name = "properties",
+ .attrs = xen_properties_attrs,
+};
+
+static int __init xen_properties_init(void)
+{
+ return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
+}
+
+static void xen_properties_destroy(void)
+{
+ sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
+}
+
+static int __init hyper_sysfs_init(void)
+{
+ int ret;
+
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ ret = xen_sysfs_type_init();
+ if (ret)
+ goto out;
+ ret = xen_sysfs_version_init();
+ if (ret)
+ goto version_out;
+ ret = xen_compilation_init();
+ if (ret)
+ goto comp_out;
+ ret = xen_sysfs_uuid_init();
+ if (ret)
+ goto uuid_out;
+ ret = xen_properties_init();
+ if (!ret)
+ goto out;
+
+ xen_sysfs_uuid_destroy();
+uuid_out:
+ xen_compilation_destroy();
+comp_out:
+ xen_sysfs_version_destroy();
+version_out:
+ xen_sysfs_type_destroy();
+out:
+ return ret;
+}
+
+static void __exit hyper_sysfs_exit(void)
+{
+ xen_properties_destroy();
+ xen_compilation_destroy();
+ xen_sysfs_uuid_destroy();
+ xen_sysfs_version_destroy();
+ xen_sysfs_type_destroy();
+
+}
+
+module_init(hyper_sysfs_init);
+module_exit(hyper_sysfs_exit);
--- /dev/null
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (C) IBM Corp. 2006
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <asm/page.h>
+#include <xen/xencomm.h>
+#include <xen/interface/xen.h>
+#ifdef __ia64__
+#include <asm/xen/xencomm.h> /* for is_kern_addr() */
+#endif
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+static int xencomm_init(struct xencomm_desc *desc,
+ void *buffer, unsigned long bytes)
+{
+ unsigned long recorded = 0;
+ int i = 0;
+
+ while ((recorded < bytes) && (i < desc->nr_addrs)) {
+ unsigned long vaddr = (unsigned long)buffer + recorded;
+ unsigned long paddr;
+ int offset;
+ int chunksz;
+
+ offset = vaddr % PAGE_SIZE; /* handle partial pages */
+ chunksz = min(PAGE_SIZE - offset, bytes - recorded);
+
+ paddr = xencomm_vtop(vaddr);
+ if (paddr == ~0UL) {
+ printk("%s: couldn't translate vaddr %lx\n",
+ __func__, vaddr);
+ return -EINVAL;
+ }
+
+ desc->address[i++] = paddr;
+ recorded += chunksz;
+ }
+
+ if (recorded < bytes) {
+ printk("%s: could only translate %ld of %ld bytes\n",
+ __func__, recorded, bytes);
+ return -ENOSPC;
+ }
+
+ /* mark remaining addresses invalid (just for safety) */
+ while (i < desc->nr_addrs)
+ desc->address[i++] = XENCOMM_INVALID;
+
+ desc->magic = XENCOMM_MAGIC;
+
+ return 0;
+}
+
+static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask,
+ void *buffer, unsigned long bytes)
+{
+ struct xencomm_desc *desc;
+ unsigned long buffer_ulong = (unsigned long)buffer;
+ unsigned long start = buffer_ulong & PAGE_MASK;
+ unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK;
+ unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT;
+ unsigned long size = sizeof(*desc) +
+ sizeof(desc->address[0]) * nr_addrs;
+
+ /*
+ * slab allocator returns at least sizeof(void*) aligned pointer.
+ * When sizeof(*desc) > sizeof(void*), struct xencomm_desc might
+ * cross page boundary.
+ */
+ if (sizeof(*desc) > sizeof(void*)) {
+ unsigned long order = get_order(size);
+ desc = (struct xencomm_desc *)__get_free_pages(gfp_mask,
+ order);
+ if (desc == NULL)
+ return NULL;
+
+ desc->nr_addrs =
+ ((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) /
+ sizeof(*desc->address);
+ } else {
+ desc = kmalloc(size, gfp_mask);
+ if (desc == NULL)
+ return NULL;
+
+ desc->nr_addrs = nr_addrs;
+ }
+ return desc;
+}
+
+void xencomm_free(struct xencomm_handle *desc)
+{
+ if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) {
+ struct xencomm_desc *desc__ = (struct xencomm_desc*)desc;
+ if (sizeof(*desc__) > sizeof(void*)) {
+ unsigned long size = sizeof(*desc__) +
+ sizeof(desc__->address[0]) * desc__->nr_addrs;
+ unsigned long order = get_order(size);
+ free_pages((unsigned long)__va(desc), order);
+ } else
+ kfree(__va(desc));
+ }
+}
+
+static int xencomm_create(void *buffer, unsigned long bytes, struct xencomm_desc **ret, gfp_t gfp_mask)
+{
+ struct xencomm_desc *desc;
+ int rc;
+
+ pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes);
+
+ if (bytes == 0) {
+ /* don't create a descriptor; Xen recognizes NULL. */
+ BUG_ON(buffer != NULL);
+ *ret = NULL;
+ return 0;
+ }
+
+ BUG_ON(buffer == NULL); /* 'bytes' is non-zero */
+
+ desc = xencomm_alloc(gfp_mask, buffer, bytes);
+ if (!desc) {
+ printk("%s failure\n", "xencomm_alloc");
+ return -ENOMEM;
+ }
+
+ rc = xencomm_init(desc, buffer, bytes);
+ if (rc) {
+ printk("%s failure: %d\n", "xencomm_init", rc);
+ xencomm_free((struct xencomm_handle *)__pa(desc));
+ return rc;
+ }
+
+ *ret = desc;
+ return 0;
+}
+
+/* check if memory address is within VMALLOC region */
+static int is_phys_contiguous(unsigned long addr)
+{
+ if (!is_kernel_addr(addr))
+ return 0;
+
+ return (addr < VMALLOC_START) || (addr >= VMALLOC_END);
+}
+
+static struct xencomm_handle *xencomm_create_inline(void *ptr)
+{
+ unsigned long paddr;
+
+ BUG_ON(!is_phys_contiguous((unsigned long)ptr));
+
+ paddr = (unsigned long)xencomm_pa(ptr);
+ BUG_ON(paddr & XENCOMM_INLINE_FLAG);
+ return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG);
+}
+
+/* "mini" routine, for stack-based communications: */
+static int xencomm_create_mini(void *buffer,
+ unsigned long bytes, struct xencomm_mini *xc_desc,
+ struct xencomm_desc **ret)
+{
+ int rc = 0;
+ struct xencomm_desc *desc;
+ BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0);
+
+ desc = (void *)xc_desc;
+
+ desc->nr_addrs = XENCOMM_MINI_ADDRS;
+
+ if (!(rc = xencomm_init(desc, buffer, bytes)))
+ *ret = desc;
+
+ return rc;
+}
+
+struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes)
+{
+ int rc;
+ struct xencomm_desc *desc;
+
+ if (is_phys_contiguous((unsigned long)ptr))
+ return xencomm_create_inline(ptr);
+
+ rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL);
+
+ if (rc || desc == NULL)
+ return NULL;
+
+ return xencomm_pa(desc);
+}
+
+struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes,
+ struct xencomm_mini *xc_desc)
+{
+ int rc;
+ struct xencomm_desc *desc = NULL;
+
+ if (is_phys_contiguous((unsigned long)ptr))
+ return xencomm_create_inline(ptr);
+
+ rc = xencomm_create_mini(ptr, bytes, xc_desc,
+ &desc);
+
+ if (rc)
+ return NULL;
+
+ return xencomm_pa(desc);
+}
--- /dev/null
+
+obj-y := evtchn.o
--- /dev/null
+/******************************************************************************
+ * evtchn.c
+ *
+ * Driver for receiving and demuxing event-channel signals.
+ *
+ * Copyright (c) 2004-2005, K A Fraser
+ * Multi-process extensions Copyright (c) 2004, Steven Smith
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/errno.h>
+#include <linux/miscdevice.h>
+#include <linux/major.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/poll.h>
+#include <linux/irq.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <xen/evtchn.h>
+#include <xen/public/evtchn.h>
+
+struct per_user_data {
+ /* Notification ring, accessed via /dev/xen/evtchn. */
+#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t))
+#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
+ evtchn_port_t *ring;
+ unsigned int ring_cons, ring_prod, ring_overflow;
+ struct mutex ring_cons_mutex; /* protect against concurrent readers */
+
+ /* Processes wait on this queue when ring is empty. */
+ wait_queue_head_t evtchn_wait;
+ struct fasync_struct *evtchn_async_queue;
+
+ int bind_cpu;
+ int nr_event_wrong_delivery;
+};
+
+/* Who's bound to each port? */
+static struct per_user_data *port_user[NR_EVENT_CHANNELS];
+static spinlock_t port_user_lock;
+
+void evtchn_device_upcall(int port)
+{
+ struct per_user_data *u;
+
+ spin_lock(&port_user_lock);
+
+ mask_evtchn(port);
+ clear_evtchn(port);
+
+ if ((u = port_user[port]) != NULL) {
+ if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
+ u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
+ if (u->ring_cons == u->ring_prod++) {
+ wake_up_interruptible(&u->evtchn_wait);
+ kill_fasync(&u->evtchn_async_queue,
+ SIGIO, POLL_IN);
+ }
+ } else {
+ u->ring_overflow = 1;
+ }
+ }
+
+ spin_unlock(&port_user_lock);
+}
+
+static void evtchn_check_wrong_delivery(struct per_user_data *u)
+{
+ evtchn_port_t port;
+ unsigned int current_cpu = smp_processor_id();
+
+ /* Delivered to correct CPU? All is good. */
+ if (u->bind_cpu == current_cpu) {
+ u->nr_event_wrong_delivery = 0;
+ return;
+ }
+
+ /* Tolerate up to 100 consecutive misdeliveries. */
+ if (++u->nr_event_wrong_delivery < 100)
+ return;
+
+ spin_lock_irq(&port_user_lock);
+
+ for (port = 0; port < NR_EVENT_CHANNELS; port++)
+ if (port_user[port] == u)
+ rebind_evtchn_to_cpu(port, current_cpu);
+
+ u->bind_cpu = current_cpu;
+ u->nr_event_wrong_delivery = 0;
+
+ spin_unlock_irq(&port_user_lock);
+}
+
+static ssize_t evtchn_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int rc;
+ unsigned int c, p, bytes1 = 0, bytes2 = 0;
+ struct per_user_data *u = file->private_data;
+
+ /* Whole number of ports. */
+ count &= ~(sizeof(evtchn_port_t)-1);
+
+ if (count == 0)
+ return 0;
+
+ if (count > PAGE_SIZE)
+ count = PAGE_SIZE;
+
+ for (;;) {
+ mutex_lock(&u->ring_cons_mutex);
+
+ rc = -EFBIG;
+ if (u->ring_overflow)
+ goto unlock_out;
+
+ if ((c = u->ring_cons) != (p = u->ring_prod))
+ break;
+
+ mutex_unlock(&u->ring_cons_mutex);
+
+ if (file->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ rc = wait_event_interruptible(
+ u->evtchn_wait, u->ring_cons != u->ring_prod);
+ if (rc)
+ return rc;
+ }
+
+ /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
+ if (((c ^ p) & EVTCHN_RING_SIZE) != 0) {
+ bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) *
+ sizeof(evtchn_port_t);
+ bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t);
+ } else {
+ bytes1 = (p - c) * sizeof(evtchn_port_t);
+ bytes2 = 0;
+ }
+
+ /* Truncate chunks according to caller's maximum byte count. */
+ if (bytes1 > count) {
+ bytes1 = count;
+ bytes2 = 0;
+ } else if ((bytes1 + bytes2) > count) {
+ bytes2 = count - bytes1;
+ }
+
+ rc = -EFAULT;
+ if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) ||
+ ((bytes2 != 0) &&
+ copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
+ goto unlock_out;
+
+ evtchn_check_wrong_delivery(u);
+
+ u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
+ rc = bytes1 + bytes2;
+
+ unlock_out:
+ mutex_unlock(&u->ring_cons_mutex);
+ return rc;
+}
+
+static ssize_t evtchn_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int rc, i;
+ evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
+ struct per_user_data *u = file->private_data;
+
+ if (kbuf == NULL)
+ return -ENOMEM;
+
+ /* Whole number of ports. */
+ count &= ~(sizeof(evtchn_port_t)-1);
+
+ rc = 0;
+ if (count == 0)
+ goto out;
+
+ if (count > PAGE_SIZE)
+ count = PAGE_SIZE;
+
+ rc = -EFAULT;
+ if (copy_from_user(kbuf, buf, count) != 0)
+ goto out;
+
+ spin_lock_irq(&port_user_lock);
+ for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
+ if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
+ unmask_evtchn(kbuf[i]);
+ spin_unlock_irq(&port_user_lock);
+
+ rc = count;
+
+ out:
+ free_page((unsigned long)kbuf);
+ return rc;
+}
+
+static unsigned int next_bind_cpu(cpumask_t map)
+{
+ static unsigned int bind_cpu;
+ bind_cpu = next_cpu(bind_cpu, map);
+ if (bind_cpu >= NR_CPUS)
+ bind_cpu = first_cpu(map);
+ return bind_cpu;
+}
+
+static void evtchn_bind_to_user(struct per_user_data *u, int port)
+{
+ spin_lock_irq(&port_user_lock);
+
+ BUG_ON(port_user[port] != NULL);
+ port_user[port] = u;
+
+ if (u->bind_cpu == -1)
+ u->bind_cpu = next_bind_cpu(cpu_online_map);
+
+ rebind_evtchn_to_cpu(port, u->bind_cpu);
+
+ unmask_evtchn(port);
+
+ spin_unlock_irq(&port_user_lock);
+}
+
+static long evtchn_ioctl(struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ int rc;
+ struct per_user_data *u = file->private_data;
+ void __user *uarg = (void __user *) arg;
+
+ switch (cmd) {
+ case IOCTL_EVTCHN_BIND_VIRQ: {
+ struct ioctl_evtchn_bind_virq bind;
+ struct evtchn_bind_virq bind_virq;
+
+ rc = -EFAULT;
+ if (copy_from_user(&bind, uarg, sizeof(bind)))
+ break;
+
+ bind_virq.virq = bind.virq;
+ bind_virq.vcpu = 0;
+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+ &bind_virq);
+ if (rc != 0)
+ break;
+
+ rc = bind_virq.port;
+ evtchn_bind_to_user(u, rc);
+ break;
+ }
+
+ case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
+ struct ioctl_evtchn_bind_interdomain bind;
+ struct evtchn_bind_interdomain bind_interdomain;
+
+ rc = -EFAULT;
+ if (copy_from_user(&bind, uarg, sizeof(bind)))
+ break;
+
+ bind_interdomain.remote_dom = bind.remote_domain;
+ bind_interdomain.remote_port = bind.remote_port;
+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+ &bind_interdomain);
+ if (rc != 0)
+ break;
+
+ rc = bind_interdomain.local_port;
+ evtchn_bind_to_user(u, rc);
+ break;
+ }
+
+ case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
+ struct ioctl_evtchn_bind_unbound_port bind;
+ struct evtchn_alloc_unbound alloc_unbound;
+
+ rc = -EFAULT;
+ if (copy_from_user(&bind, uarg, sizeof(bind)))
+ break;
+
+ alloc_unbound.dom = DOMID_SELF;
+ alloc_unbound.remote_dom = bind.remote_domain;
+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+ &alloc_unbound);
+ if (rc != 0)
+ break;
+
+ rc = alloc_unbound.port;
+ evtchn_bind_to_user(u, rc);
+ break;
+ }
+
+ case IOCTL_EVTCHN_UNBIND: {
+ struct ioctl_evtchn_unbind unbind;
+ struct evtchn_close close;
+ int ret;
+
+ rc = -EFAULT;
+ if (copy_from_user(&unbind, uarg, sizeof(unbind)))
+ break;
+
+ rc = -EINVAL;
+ if (unbind.port >= NR_EVENT_CHANNELS)
+ break;
+
+ spin_lock_irq(&port_user_lock);
+
+ rc = -ENOTCONN;
+ if (port_user[unbind.port] != u) {
+ spin_unlock_irq(&port_user_lock);
+ break;
+ }
+
+ port_user[unbind.port] = NULL;
+ mask_evtchn(unbind.port);
+ rebind_evtchn_to_cpu(unbind.port, 0);
+
+ spin_unlock_irq(&port_user_lock);
+
+ close.port = unbind.port;
+ ret = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
+ BUG_ON(ret);
+
+ rc = 0;
+ break;
+ }
+
+ case IOCTL_EVTCHN_NOTIFY: {
+ struct ioctl_evtchn_notify notify;
+
+ rc = -EFAULT;
+ if (copy_from_user(¬ify, uarg, sizeof(notify)))
+ break;
+
+ if (notify.port >= NR_EVENT_CHANNELS) {
+ rc = -EINVAL;
+ } else if (port_user[notify.port] != u) {
+ rc = -ENOTCONN;
+ } else {
+ notify_remote_via_evtchn(notify.port);
+ rc = 0;
+ }
+ break;
+ }
+
+ case IOCTL_EVTCHN_RESET: {
+ /* Initialise the ring to empty. Clear errors. */
+ mutex_lock(&u->ring_cons_mutex);
+ spin_lock_irq(&port_user_lock);
+ u->ring_cons = u->ring_prod = u->ring_overflow = 0;
+ spin_unlock_irq(&port_user_lock);
+ mutex_unlock(&u->ring_cons_mutex);
+ rc = 0;
+ break;
+ }
+
+ default:
+ rc = -ENOSYS;
+ break;
+ }
+
+ return rc;
+}
+
+static unsigned int evtchn_poll(struct file *file, poll_table *wait)
+{
+ unsigned int mask = POLLOUT | POLLWRNORM;
+ struct per_user_data *u = file->private_data;
+
+ poll_wait(file, &u->evtchn_wait, wait);
+ if (u->ring_cons != u->ring_prod)
+ mask |= POLLIN | POLLRDNORM;
+ if (u->ring_overflow)
+ mask = POLLERR;
+ return mask;
+}
+
+static int evtchn_fasync(int fd, struct file *filp, int on)
+{
+ struct per_user_data *u = filp->private_data;
+ return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
+}
+
+static int evtchn_open(struct inode *inode, struct file *filp)
+{
+ struct per_user_data *u;
+
+ if ((u = kmalloc(sizeof(*u), GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+
+ memset(u, 0, sizeof(*u));
+ init_waitqueue_head(&u->evtchn_wait);
+
+ u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
+ if (u->ring == NULL) {
+ kfree(u);
+ return -ENOMEM;
+ }
+
+ mutex_init(&u->ring_cons_mutex);
+
+ filp->private_data = u;
+
+ u->bind_cpu = -1;
+
+ return 0;
+}
+
+static int evtchn_release(struct inode *inode, struct file *filp)
+{
+ int i;
+ struct per_user_data *u = filp->private_data;
+ struct evtchn_close close;
+
+ spin_lock_irq(&port_user_lock);
+
+ free_page((unsigned long)u->ring);
+
+ for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+ int ret;
+ if (port_user[i] != u)
+ continue;
+
+ port_user[i] = NULL;
+ mask_evtchn(i);
+ rebind_evtchn_to_cpu(i, 0);
+
+ close.port = i;
+ ret = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
+ BUG_ON(ret);
+ }
+
+ spin_unlock_irq(&port_user_lock);
+
+ kfree(u);
+
+ return 0;
+}
+
+static const struct file_operations evtchn_fops = {
+ .owner = THIS_MODULE,
+ .read = evtchn_read,
+ .write = evtchn_write,
+ .unlocked_ioctl = evtchn_ioctl,
+ .poll = evtchn_poll,
+ .fasync = evtchn_fasync,
+ .open = evtchn_open,
+ .release = evtchn_release,
+};
+
+static struct miscdevice evtchn_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "evtchn",
+ .fops = &evtchn_fops,
+};
+
+static int __cpuinit evtchn_cpu_notify(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ int hotcpu = (unsigned long)hcpu;
+ cpumask_t map = cpu_online_map;
+ int port, newcpu;
+ struct per_user_data *u;
+
+ switch (action) {
+ case CPU_DOWN_PREPARE:
+ cpu_clear(hotcpu, map);
+ spin_lock_irq(&port_user_lock);
+ for (port = 0; port < NR_EVENT_CHANNELS; port++) {
+ if ((u = port_user[port]) != NULL &&
+ u->bind_cpu == hotcpu &&
+ (newcpu = next_bind_cpu(map)) < NR_CPUS) {
+ rebind_evtchn_to_cpu(port, newcpu);
+ u->bind_cpu = newcpu;
+ }
+ }
+ spin_unlock_irq(&port_user_lock);
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata evtchn_cpu_nfb = {
+ .notifier_call = evtchn_cpu_notify
+};
+
+static int __init evtchn_init(void)
+{
+ int err;
+
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ spin_lock_init(&port_user_lock);
+ memset(port_user, 0, sizeof(port_user));
+
+ /* Create '/dev/misc/evtchn'. */
+ err = misc_register(&evtchn_miscdev);
+ if (err != 0) {
+ printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
+ return err;
+ }
+
+ register_cpu_notifier(&evtchn_cpu_nfb);
+
+ printk("Event-channel device installed.\n");
+
+ return 0;
+}
+
+static void __exit evtchn_cleanup(void)
+{
+ misc_deregister(&evtchn_miscdev);
+ unregister_cpu_notifier(&evtchn_cpu_nfb);
+}
+
+module_init(evtchn_init);
+module_exit(evtchn_cleanup);
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null
+obj-$(CONFIG_XEN_FRAMEBUFFER) := xenfb.o
+obj-$(CONFIG_XEN_KEYBOARD) += xenkbd.o
--- /dev/null
+/*
+ * linux/drivers/video/xenfb.c -- Xen para-virtual frame buffer device
+ *
+ * Copyright (C) 2005-2006 Anthony Liguori <aliguori@us.ibm.com>
+ * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
+ *
+ * Based on linux/drivers/video/q40fb.c
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of this archive for
+ * more details.
+ */
+
+/*
+ * TODO:
+ *
+ * Switch to grant tables when they become capable of dealing with the
+ * frame buffer.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fb.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/freezer.h>
+#include <asm/hypervisor.h>
+#include <xen/evtchn.h>
+#include <xen/interface/io/fbif.h>
+#include <xen/interface/io/protocols.h>
+#include <xen/xenbus.h>
+#include <linux/kthread.h>
+
+struct xenfb_mapping
+{
+ struct list_head link;
+ struct vm_area_struct *vma;
+ atomic_t map_refs;
+ int faults;
+ struct xenfb_info *info;
+};
+
+struct xenfb_info
+{
+ struct task_struct *kthread;
+ wait_queue_head_t wq;
+
+ unsigned char *fb;
+ struct fb_info *fb_info;
+ struct timer_list refresh;
+ int dirty;
+ int x1, y1, x2, y2; /* dirty rectangle,
+ protected by dirty_lock */
+ spinlock_t dirty_lock;
+ struct mutex mm_lock;
+ int nr_pages;
+ struct page **pages;
+ struct list_head mappings; /* protected by mm_lock */
+
+ int irq;
+ struct xenfb_page *page;
+ unsigned long *mfns;
+ int update_wanted; /* XENFB_TYPE_UPDATE wanted */
+ int feature_resize; /* Backend has resize feature */
+ struct xenfb_resize resize;
+ int resize_dpy;
+ spinlock_t resize_lock;
+
+ struct xenbus_device *xbdev;
+};
+
+/*
+ * There are three locks:
+ * spinlock resize_lock protecting resize_dpy and resize
+ * spinlock dirty_lock protecting the dirty rectangle
+ * mutex mm_lock protecting mappings.
+ *
+ * How the dirty and mapping locks work together
+ *
+ * The problem is that dirty rectangle and mappings aren't
+ * independent: the dirty rectangle must cover all faulted pages in
+ * mappings. We need to prove that our locking maintains this
+ * invariant.
+ *
+ * There are several kinds of critical regions:
+ *
+ * 1. Holding only dirty_lock: xenfb_refresh(). May run in
+ * interrupts. Extends the dirty rectangle. Trivially preserves
+ * invariant.
+ *
+ * 2. Holding only mm_lock: xenfb_mmap() and xenfb_vm_close(). Touch
+ * only mappings. The former creates unfaulted pages. Preserves
+ * invariant. The latter removes pages. Preserves invariant.
+ *
+ * 3. Holding both locks: xenfb_vm_fault(). Extends the dirty
+ * rectangle and updates mappings consistently. Preserves
+ * invariant.
+ *
+ * 4. The ugliest one: xenfb_update_screen(). Clear the dirty
+ * rectangle and update mappings consistently.
+ *
+ * We can't simply hold both locks, because zap_page_range() cannot
+ * be called with a spinlock held.
+ *
+ * Therefore, we first clear the dirty rectangle with both locks
+ * held. Then we unlock dirty_lock and update the mappings.
+ * Critical regions that hold only dirty_lock may interfere with
+ * that. This can only be region 1: xenfb_refresh(). But that
+ * just extends the dirty rectangle, which can't harm the
+ * invariant.
+ *
+ * But FIXME: the invariant is too weak. It misses that the fault
+ * record in mappings must be consistent with the mapping of pages in
+ * the associated address space! __do_fault() updates the PTE after
+ * xenfb_vm_fault() returns, i.e. outside the critical region. This
+ * allows the following race:
+ *
+ * X writes to some address in the Xen frame buffer
+ * Fault - call __do_fault()
+ * call xenfb_vm_fault()
+ * grab mm_lock
+ * map->faults++;
+ * release mm_lock
+ * return back to do_no_page()
+ * (preempted, or SMP)
+ * Xen worker thread runs.
+ * grab mm_lock
+ * look at mappings
+ * find this mapping, zaps its pages (but page not in pte yet)
+ * clear map->faults
+ * releases mm_lock
+ * (back to X process)
+ * put page in X's pte
+ *
+ * Oh well, we wont be updating the writes to this page anytime soon.
+ */
+#define MB_ (1024*1024)
+#define XENFB_DEFAULT_FB_LEN (XENFB_WIDTH * XENFB_HEIGHT * XENFB_DEPTH / 8)
+
+enum {KPARAM_MEM, KPARAM_WIDTH, KPARAM_HEIGHT, KPARAM_CNT};
+static int video[KPARAM_CNT] = {2, XENFB_WIDTH, XENFB_HEIGHT};
+module_param_array(video, int, NULL, 0);
+MODULE_PARM_DESC(video,
+ "Size of video memory in MB and width,height in pixels, default = (2,800,600)");
+
+static int xenfb_fps = 20;
+
+static int xenfb_remove(struct xenbus_device *);
+static void xenfb_init_shared_page(struct xenfb_info *, struct fb_info *);
+static int xenfb_connect_backend(struct xenbus_device *, struct xenfb_info *);
+static void xenfb_disconnect_backend(struct xenfb_info *);
+
+static void xenfb_send_event(struct xenfb_info *info,
+ union xenfb_out_event *event)
+{
+ __u32 prod;
+
+ prod = info->page->out_prod;
+ /* caller ensures !xenfb_queue_full() */
+ mb(); /* ensure ring space available */
+ XENFB_OUT_RING_REF(info->page, prod) = *event;
+ wmb(); /* ensure ring contents visible */
+ info->page->out_prod = prod + 1;
+
+ notify_remote_via_irq(info->irq);
+}
+
+static void xenfb_do_update(struct xenfb_info *info,
+ int x, int y, int w, int h)
+{
+ union xenfb_out_event event;
+
+ memset(&event, 0, sizeof(event));
+ event.type = XENFB_TYPE_UPDATE;
+ event.update.x = x;
+ event.update.y = y;
+ event.update.width = w;
+ event.update.height = h;
+
+ /* caller ensures !xenfb_queue_full() */
+ xenfb_send_event(info, &event);
+}
+
+static void xenfb_do_resize(struct xenfb_info *info)
+{
+ union xenfb_out_event event;
+
+ memset(&event, 0, sizeof(event));
+ event.resize = info->resize;
+
+ /* caller ensures !xenfb_queue_full() */
+ xenfb_send_event(info, &event);
+}
+
+static int xenfb_queue_full(struct xenfb_info *info)
+{
+ __u32 cons, prod;
+
+ prod = info->page->out_prod;
+ cons = info->page->out_cons;
+ return prod - cons == XENFB_OUT_RING_LEN;
+}
+
+static void xenfb_update_screen(struct xenfb_info *info)
+{
+ unsigned long flags;
+ int y1, y2, x1, x2;
+ struct xenfb_mapping *map;
+
+ if (!info->update_wanted)
+ return;
+ if (xenfb_queue_full(info))
+ return;
+
+ mutex_lock(&info->mm_lock);
+
+ spin_lock_irqsave(&info->dirty_lock, flags);
+ y1 = info->y1;
+ y2 = info->y2;
+ x1 = info->x1;
+ x2 = info->x2;
+ info->x1 = info->y1 = INT_MAX;
+ info->x2 = info->y2 = 0;
+ spin_unlock_irqrestore(&info->dirty_lock, flags);
+
+ list_for_each_entry(map, &info->mappings, link) {
+ if (!map->faults)
+ continue;
+ zap_page_range(map->vma, map->vma->vm_start,
+ map->vma->vm_end - map->vma->vm_start, NULL);
+ map->faults = 0;
+ }
+
+ mutex_unlock(&info->mm_lock);
+
+ if (x2 < x1 || y2 < y1) {
+ printk("xenfb_update_screen bogus rect %d %d %d %d\n",
+ x1, x2, y1, y2);
+ WARN_ON(1);
+ }
+ xenfb_do_update(info, x1, y1, x2 - x1, y2 - y1);
+}
+
+static void xenfb_handle_resize_dpy(struct xenfb_info *info)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&info->resize_lock, flags);
+ if (info->resize_dpy) {
+ if (!xenfb_queue_full(info)) {
+ info->resize_dpy = 0;
+ xenfb_do_resize(info);
+ }
+ }
+ spin_unlock_irqrestore(&info->resize_lock, flags);
+}
+
+static int xenfb_thread(void *data)
+{
+ struct xenfb_info *info = data;
+
+ while (!kthread_should_stop()) {
+ xenfb_handle_resize_dpy(info);
+ if (info->dirty) {
+ info->dirty = 0;
+ xenfb_update_screen(info);
+ }
+ wait_event_interruptible(info->wq,
+ kthread_should_stop() || info->dirty);
+ try_to_freeze();
+ }
+ return 0;
+}
+
+static int xenfb_setcolreg(unsigned regno, unsigned red, unsigned green,
+ unsigned blue, unsigned transp,
+ struct fb_info *info)
+{
+ u32 v;
+
+ if (regno > info->cmap.len)
+ return 1;
+
+ red >>= (16 - info->var.red.length);
+ green >>= (16 - info->var.green.length);
+ blue >>= (16 - info->var.blue.length);
+
+ v = (red << info->var.red.offset) |
+ (green << info->var.green.offset) |
+ (blue << info->var.blue.offset);
+
+ /* FIXME is this sane? check against xxxfb_setcolreg()! */
+ switch (info->var.bits_per_pixel) {
+ case 16:
+ case 24:
+ case 32:
+ ((u32 *)info->pseudo_palette)[regno] = v;
+ break;
+ }
+
+ return 0;
+}
+
+static void xenfb_timer(unsigned long data)
+{
+ struct xenfb_info *info = (struct xenfb_info *)data;
+ wake_up(&info->wq);
+}
+
+static void __xenfb_refresh(struct xenfb_info *info,
+ int x1, int y1, int w, int h)
+{
+ int y2, x2;
+
+ y2 = y1 + h;
+ x2 = x1 + w;
+
+ if (info->y1 > y1)
+ info->y1 = y1;
+ if (info->y2 < y2)
+ info->y2 = y2;
+ if (info->x1 > x1)
+ info->x1 = x1;
+ if (info->x2 < x2)
+ info->x2 = x2;
+ info->dirty = 1;
+
+ if (timer_pending(&info->refresh))
+ return;
+
+ mod_timer(&info->refresh, jiffies + HZ/xenfb_fps);
+}
+
+static void xenfb_refresh(struct xenfb_info *info,
+ int x1, int y1, int w, int h)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&info->dirty_lock, flags);
+ __xenfb_refresh(info, x1, y1, w, h);
+ spin_unlock_irqrestore(&info->dirty_lock, flags);
+}
+
+static void xenfb_fillrect(struct fb_info *p, const struct fb_fillrect *rect)
+{
+ struct xenfb_info *info = p->par;
+
+ cfb_fillrect(p, rect);
+ xenfb_refresh(info, rect->dx, rect->dy, rect->width, rect->height);
+}
+
+static void xenfb_imageblit(struct fb_info *p, const struct fb_image *image)
+{
+ struct xenfb_info *info = p->par;
+
+ cfb_imageblit(p, image);
+ xenfb_refresh(info, image->dx, image->dy, image->width, image->height);
+}
+
+static void xenfb_copyarea(struct fb_info *p, const struct fb_copyarea *area)
+{
+ struct xenfb_info *info = p->par;
+
+ cfb_copyarea(p, area);
+ xenfb_refresh(info, area->dx, area->dy, area->width, area->height);
+}
+
+static void xenfb_vm_open(struct vm_area_struct *vma)
+{
+ struct xenfb_mapping *map = vma->vm_private_data;
+ atomic_inc(&map->map_refs);
+}
+
+static void xenfb_vm_close(struct vm_area_struct *vma)
+{
+ struct xenfb_mapping *map = vma->vm_private_data;
+ struct xenfb_info *info = map->info;
+
+ mutex_lock(&info->mm_lock);
+ if (atomic_dec_and_test(&map->map_refs)) {
+ list_del(&map->link);
+ kfree(map);
+ }
+ mutex_unlock(&info->mm_lock);
+}
+
+static int xenfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct xenfb_mapping *map = vma->vm_private_data;
+ struct xenfb_info *info = map->info;
+ int pgnr = ((long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT;
+ unsigned long flags;
+ struct page *page;
+ int y1, y2;
+
+ if (pgnr >= info->nr_pages)
+ return VM_FAULT_SIGBUS;
+
+ mutex_lock(&info->mm_lock);
+ spin_lock_irqsave(&info->dirty_lock, flags);
+ page = info->pages[pgnr];
+ get_page(page);
+ map->faults++;
+
+ y1 = pgnr * PAGE_SIZE / info->fb_info->fix.line_length;
+ y2 = (pgnr * PAGE_SIZE + PAGE_SIZE - 1) / info->fb_info->fix.line_length;
+ if (y2 > info->fb_info->var.yres)
+ y2 = info->fb_info->var.yres;
+ __xenfb_refresh(info, 0, y1, info->fb_info->var.xres, y2 - y1);
+ spin_unlock_irqrestore(&info->dirty_lock, flags);
+ mutex_unlock(&info->mm_lock);
+
+ vmf->page = page;
+
+ return VM_FAULT_MINOR;
+}
+
+static struct vm_operations_struct xenfb_vm_ops = {
+ .open = xenfb_vm_open,
+ .close = xenfb_vm_close,
+ .fault = xenfb_vm_fault,
+};
+
+static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma)
+{
+ struct xenfb_info *info = fb_info->par;
+ struct xenfb_mapping *map;
+ int map_pages;
+
+ if (!(vma->vm_flags & VM_WRITE))
+ return -EINVAL;
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+ if (vma->vm_pgoff != 0)
+ return -EINVAL;
+
+ map_pages = (vma->vm_end - vma->vm_start + PAGE_SIZE-1) >> PAGE_SHIFT;
+ if (map_pages > info->nr_pages)
+ return -EINVAL;
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (map == NULL)
+ return -ENOMEM;
+
+ map->vma = vma;
+ map->faults = 0;
+ map->info = info;
+ atomic_set(&map->map_refs, 1);
+
+ mutex_lock(&info->mm_lock);
+ list_add(&map->link, &info->mappings);
+ mutex_unlock(&info->mm_lock);
+
+ vma->vm_ops = &xenfb_vm_ops;
+ vma->vm_flags |= (VM_DONTEXPAND | VM_RESERVED);
+ vma->vm_private_data = map;
+
+ return 0;
+}
+
+static int
+xenfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info)
+{
+ struct xenfb_info *xenfb_info;
+ int required_mem_len;
+
+ xenfb_info = info->par;
+
+ if (!xenfb_info->feature_resize) {
+ if (var->xres == video[KPARAM_WIDTH] &&
+ var->yres == video[KPARAM_HEIGHT] &&
+ var->bits_per_pixel == xenfb_info->page->depth) {
+ return 0;
+ }
+ return -EINVAL;
+ }
+
+ /* Can't resize past initial width and height */
+ if (var->xres > video[KPARAM_WIDTH] || var->yres > video[KPARAM_HEIGHT])
+ return -EINVAL;
+
+ required_mem_len = var->xres * var->yres * (xenfb_info->page->depth / 8);
+ if (var->bits_per_pixel == xenfb_info->page->depth &&
+ var->xres <= info->fix.line_length / (XENFB_DEPTH / 8) &&
+ required_mem_len <= info->fix.smem_len) {
+ var->xres_virtual = var->xres;
+ var->yres_virtual = var->yres;
+ return 0;
+ }
+ return -EINVAL;
+}
+
+static int xenfb_set_par(struct fb_info *info)
+{
+ struct xenfb_info *xenfb_info;
+ unsigned long flags;
+
+ xenfb_info = info->par;
+
+ spin_lock_irqsave(&xenfb_info->resize_lock, flags);
+ xenfb_info->resize.type = XENFB_TYPE_RESIZE;
+ xenfb_info->resize.width = info->var.xres;
+ xenfb_info->resize.height = info->var.yres;
+ xenfb_info->resize.stride = info->fix.line_length;
+ xenfb_info->resize.depth = info->var.bits_per_pixel;
+ xenfb_info->resize.offset = 0;
+ xenfb_info->resize_dpy = 1;
+ spin_unlock_irqrestore(&xenfb_info->resize_lock, flags);
+ return 0;
+}
+
+static struct fb_ops xenfb_fb_ops = {
+ .owner = THIS_MODULE,
+ .fb_setcolreg = xenfb_setcolreg,
+ .fb_fillrect = xenfb_fillrect,
+ .fb_copyarea = xenfb_copyarea,
+ .fb_imageblit = xenfb_imageblit,
+ .fb_mmap = xenfb_mmap,
+ .fb_check_var = xenfb_check_var,
+ .fb_set_par = xenfb_set_par,
+};
+
+static irqreturn_t xenfb_event_handler(int rq, void *dev_id)
+{
+ /*
+ * No in events recognized, simply ignore them all.
+ * If you need to recognize some, see xenbkd's input_handler()
+ * for how to do that.
+ */
+ struct xenfb_info *info = dev_id;
+ struct xenfb_page *page = info->page;
+
+ if (page->in_cons != page->in_prod) {
+ info->page->in_cons = info->page->in_prod;
+ notify_remote_via_irq(info->irq);
+ }
+ return IRQ_HANDLED;
+}
+
+static unsigned long vmalloc_to_mfn(void *address)
+{
+ return pfn_to_mfn(vmalloc_to_pfn(address));
+}
+
+static int __devinit xenfb_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ struct xenfb_info *info;
+ struct fb_info *fb_info;
+ int fb_size;
+ int val;
+ int ret;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (info == NULL) {
+ xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
+ return -ENOMEM;
+ }
+
+ /* Limit kernel param videoram amount to what is in xenstore */
+ if (xenbus_scanf(XBT_NIL, dev->otherend, "videoram", "%d", &val) == 1) {
+ if (val < video[KPARAM_MEM])
+ video[KPARAM_MEM] = val;
+ }
+
+ /* If requested res does not fit in available memory, use default */
+ fb_size = video[KPARAM_MEM] * MB_;
+ if (video[KPARAM_WIDTH] * video[KPARAM_HEIGHT] * XENFB_DEPTH/8 > fb_size) {
+ video[KPARAM_WIDTH] = XENFB_WIDTH;
+ video[KPARAM_HEIGHT] = XENFB_HEIGHT;
+ fb_size = XENFB_DEFAULT_FB_LEN;
+ }
+
+ dev->dev.driver_data = info;
+ info->xbdev = dev;
+ info->irq = -1;
+ info->x1 = info->y1 = INT_MAX;
+ spin_lock_init(&info->dirty_lock);
+ spin_lock_init(&info->resize_lock);
+ mutex_init(&info->mm_lock);
+ init_waitqueue_head(&info->wq);
+ init_timer(&info->refresh);
+ info->refresh.function = xenfb_timer;
+ info->refresh.data = (unsigned long)info;
+ INIT_LIST_HEAD(&info->mappings);
+
+ info->fb = vmalloc(fb_size);
+ if (info->fb == NULL)
+ goto error_nomem;
+ memset(info->fb, 0, fb_size);
+
+ info->nr_pages = (fb_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ info->pages = kmalloc(sizeof(struct page *) * info->nr_pages,
+ GFP_KERNEL);
+ if (info->pages == NULL)
+ goto error_nomem;
+
+ info->mfns = vmalloc(sizeof(unsigned long) * info->nr_pages);
+ if (!info->mfns)
+ goto error_nomem;
+
+ /* set up shared page */
+ info->page = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ if (!info->page)
+ goto error_nomem;
+
+ fb_info = framebuffer_alloc(sizeof(u32) * 256, NULL);
+ /* see fishy hackery below */
+ if (fb_info == NULL)
+ goto error_nomem;
+
+ /* FIXME fishy hackery */
+ fb_info->pseudo_palette = fb_info->par;
+ fb_info->par = info;
+ /* /FIXME */
+ fb_info->screen_base = info->fb;
+
+ fb_info->fbops = &xenfb_fb_ops;
+ fb_info->var.xres_virtual = fb_info->var.xres = video[KPARAM_WIDTH];
+ fb_info->var.yres_virtual = fb_info->var.yres = video[KPARAM_HEIGHT];
+ fb_info->var.bits_per_pixel = XENFB_DEPTH;
+
+ fb_info->var.red = (struct fb_bitfield){16, 8, 0};
+ fb_info->var.green = (struct fb_bitfield){8, 8, 0};
+ fb_info->var.blue = (struct fb_bitfield){0, 8, 0};
+
+ fb_info->var.activate = FB_ACTIVATE_NOW;
+ fb_info->var.height = -1;
+ fb_info->var.width = -1;
+ fb_info->var.vmode = FB_VMODE_NONINTERLACED;
+
+ fb_info->fix.visual = FB_VISUAL_TRUECOLOR;
+ fb_info->fix.line_length = fb_info->var.xres * (XENFB_DEPTH / 8);
+ fb_info->fix.smem_start = 0;
+ fb_info->fix.smem_len = fb_size;
+ strcpy(fb_info->fix.id, "xen");
+ fb_info->fix.type = FB_TYPE_PACKED_PIXELS;
+ fb_info->fix.accel = FB_ACCEL_NONE;
+
+ fb_info->flags = FBINFO_FLAG_DEFAULT;
+
+ ret = fb_alloc_cmap(&fb_info->cmap, 256, 0);
+ if (ret < 0) {
+ framebuffer_release(fb_info);
+ xenbus_dev_fatal(dev, ret, "fb_alloc_cmap");
+ goto error;
+ }
+
+ xenfb_init_shared_page(info, fb_info);
+
+ ret = register_framebuffer(fb_info);
+ if (ret) {
+ fb_dealloc_cmap(&info->fb_info->cmap);
+ framebuffer_release(fb_info);
+ xenbus_dev_fatal(dev, ret, "register_framebuffer");
+ goto error;
+ }
+ info->fb_info = fb_info;
+
+ /* FIXME should this be delayed until backend XenbusStateConnected? */
+ info->kthread = kthread_run(xenfb_thread, info, "xenfb thread");
+ if (IS_ERR(info->kthread)) {
+ ret = PTR_ERR(info->kthread);
+ info->kthread = NULL;
+ xenbus_dev_fatal(dev, ret, "register_framebuffer");
+ goto error;
+ }
+
+ ret = xenfb_connect_backend(dev, info);
+ if (ret < 0)
+ goto error;
+
+ return 0;
+
+ error_nomem:
+ ret = -ENOMEM;
+ xenbus_dev_fatal(dev, ret, "allocating device memory");
+ error:
+ xenfb_remove(dev);
+ return ret;
+}
+
+static int xenfb_resume(struct xenbus_device *dev)
+{
+ struct xenfb_info *info = dev->dev.driver_data;
+
+ xenfb_disconnect_backend(info);
+ xenfb_init_shared_page(info, info->fb_info);
+ return xenfb_connect_backend(dev, info);
+}
+
+static int xenfb_remove(struct xenbus_device *dev)
+{
+ struct xenfb_info *info = dev->dev.driver_data;
+
+ del_timer(&info->refresh);
+ if (info->kthread)
+ kthread_stop(info->kthread);
+ xenfb_disconnect_backend(info);
+ if (info->fb_info) {
+ unregister_framebuffer(info->fb_info);
+ fb_dealloc_cmap(&info->fb_info->cmap);
+ framebuffer_release(info->fb_info);
+ }
+ free_page((unsigned long)info->page);
+ vfree(info->mfns);
+ kfree(info->pages);
+ vfree(info->fb);
+ kfree(info);
+
+ return 0;
+}
+
+static void xenfb_init_shared_page(struct xenfb_info *info,
+ struct fb_info * fb_info)
+{
+ int i;
+ int epd = PAGE_SIZE / sizeof(info->mfns[0]);
+
+ for (i = 0; i < info->nr_pages; i++)
+ info->pages[i] = vmalloc_to_page(info->fb + i * PAGE_SIZE);
+
+ for (i = 0; i < info->nr_pages; i++)
+ info->mfns[i] = vmalloc_to_mfn(info->fb + i * PAGE_SIZE);
+
+ for (i = 0; i * epd < info->nr_pages; i++)
+ info->page->pd[i] = vmalloc_to_mfn(&info->mfns[i * epd]);
+
+ info->page->width = fb_info->var.xres;
+ info->page->height = fb_info->var.yres;
+ info->page->depth = fb_info->var.bits_per_pixel;
+ info->page->line_length = fb_info->fix.line_length;
+ info->page->mem_length = fb_info->fix.smem_len;
+ info->page->in_cons = info->page->in_prod = 0;
+ info->page->out_cons = info->page->out_prod = 0;
+}
+
+static int xenfb_connect_backend(struct xenbus_device *dev,
+ struct xenfb_info *info)
+{
+ int ret;
+ struct xenbus_transaction xbt;
+
+ ret = bind_listening_port_to_irqhandler(
+ dev->otherend_id, xenfb_event_handler, 0, "xenfb", info);
+ if (ret < 0) {
+ xenbus_dev_fatal(dev, ret,
+ "bind_listening_port_to_irqhandler");
+ return ret;
+ }
+ info->irq = ret;
+
+ again:
+ ret = xenbus_transaction_start(&xbt);
+ if (ret) {
+ xenbus_dev_fatal(dev, ret, "starting transaction");
+ return ret;
+ }
+ ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
+ virt_to_mfn(info->page));
+ if (ret)
+ goto error_xenbus;
+ ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
+ irq_to_evtchn_port(info->irq));
+ if (ret)
+ goto error_xenbus;
+ ret = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
+ XEN_IO_PROTO_ABI_NATIVE);
+ if (ret)
+ goto error_xenbus;
+ ret = xenbus_printf(xbt, dev->nodename, "feature-update", "1");
+ if (ret)
+ goto error_xenbus;
+ ret = xenbus_transaction_end(xbt, 0);
+ if (ret) {
+ if (ret == -EAGAIN)
+ goto again;
+ xenbus_dev_fatal(dev, ret, "completing transaction");
+ return ret;
+ }
+
+ xenbus_switch_state(dev, XenbusStateInitialised);
+ return 0;
+
+ error_xenbus:
+ xenbus_transaction_end(xbt, 1);
+ xenbus_dev_fatal(dev, ret, "writing xenstore");
+ return ret;
+}
+
+static void xenfb_disconnect_backend(struct xenfb_info *info)
+{
+ if (info->irq >= 0)
+ unbind_from_irqhandler(info->irq, info);
+ info->irq = -1;
+}
+
+static void xenfb_backend_changed(struct xenbus_device *dev,
+ enum xenbus_state backend_state)
+{
+ struct xenfb_info *info = dev->dev.driver_data;
+ int val;
+
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitialised:
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ break;
+
+ case XenbusStateInitWait:
+ InitWait:
+ xenbus_switch_state(dev, XenbusStateConnected);
+ break;
+
+ case XenbusStateConnected:
+ /*
+ * Work around xenbus race condition: If backend goes
+ * through InitWait to Connected fast enough, we can
+ * get Connected twice here.
+ */
+ if (dev->state != XenbusStateConnected)
+ goto InitWait; /* no InitWait seen yet, fudge it */
+
+ if (xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "request-update", "%d", &val) < 0)
+ val = 0;
+ if (val)
+ info->update_wanted = 1;
+
+ if (xenbus_scanf(XBT_NIL, dev->otherend,
+ "feature-resize", "%d", &val) < 0)
+ val = 0;
+ info->feature_resize = val;
+ break;
+
+ case XenbusStateClosing:
+ // FIXME is this safe in any dev->state?
+ xenbus_frontend_closed(dev);
+ break;
+ }
+}
+
+static const struct xenbus_device_id xenfb_ids[] = {
+ { "vfb" },
+ { "" }
+};
+MODULE_ALIAS("xen:vfb");
+
+static struct xenbus_driver xenfb_driver = {
+ .name = "vfb",
+ .ids = xenfb_ids,
+ .probe = xenfb_probe,
+ .remove = xenfb_remove,
+ .resume = xenfb_resume,
+ .otherend_changed = xenfb_backend_changed,
+};
+
+static int __init xenfb_init(void)
+{
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ /* Nothing to do if running in dom0. */
+ if (is_initial_xendomain())
+ return -ENODEV;
+
+ return xenbus_register_frontend(&xenfb_driver);
+}
+
+static void __exit xenfb_cleanup(void)
+{
+ return xenbus_unregister_driver(&xenfb_driver);
+}
+
+module_init(xenfb_init);
+module_exit(xenfb_cleanup);
+
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * linux/drivers/input/keyboard/xenkbd.c -- Xen para-virtual input device
+ *
+ * Copyright (C) 2005 Anthony Liguori <aliguori@us.ibm.com>
+ * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
+ *
+ * Based on linux/drivers/input/mouse/sermouse.c
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of this archive for
+ * more details.
+ */
+
+/*
+ * TODO:
+ *
+ * Switch to grant tables together with xenfb.c.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/input.h>
+#include <asm/hypervisor.h>
+#include <xen/evtchn.h>
+#include <xen/interface/io/fbif.h>
+#include <xen/interface/io/kbdif.h>
+#include <xen/xenbus.h>
+
+struct xenkbd_info
+{
+ struct input_dev *kbd;
+ struct input_dev *ptr;
+ struct xenkbd_page *page;
+ int irq;
+ struct xenbus_device *xbdev;
+ char phys[32];
+};
+
+static int xenkbd_remove(struct xenbus_device *);
+static int xenkbd_connect_backend(struct xenbus_device *, struct xenkbd_info *);
+static void xenkbd_disconnect_backend(struct xenkbd_info *);
+
+/*
+ * Note: if you need to send out events, see xenfb_do_update() for how
+ * to do that.
+ */
+
+static irqreturn_t input_handler(int rq, void *dev_id)
+{
+ struct xenkbd_info *info = dev_id;
+ struct xenkbd_page *page = info->page;
+ __u32 cons, prod;
+
+ prod = page->in_prod;
+ if (prod == page->in_cons)
+ return IRQ_HANDLED;
+ rmb(); /* ensure we see ring contents up to prod */
+ for (cons = page->in_cons; cons != prod; cons++) {
+ union xenkbd_in_event *event;
+ struct input_dev *dev;
+ event = &XENKBD_IN_RING_REF(page, cons);
+
+ dev = info->ptr;
+ switch (event->type) {
+ case XENKBD_TYPE_MOTION:
+ if (event->motion.rel_z)
+ input_report_rel(dev, REL_WHEEL,
+ -event->motion.rel_z);
+ input_report_rel(dev, REL_X, event->motion.rel_x);
+ input_report_rel(dev, REL_Y, event->motion.rel_y);
+ break;
+ case XENKBD_TYPE_KEY:
+ dev = NULL;
+ if (test_bit(event->key.keycode, info->kbd->keybit))
+ dev = info->kbd;
+ if (test_bit(event->key.keycode, info->ptr->keybit))
+ dev = info->ptr;
+ if (dev)
+ input_report_key(dev, event->key.keycode,
+ event->key.pressed);
+ else
+ printk("xenkbd: unhandled keycode 0x%x\n",
+ event->key.keycode);
+ break;
+ case XENKBD_TYPE_POS:
+ if (event->pos.rel_z)
+ input_report_rel(dev, REL_WHEEL,
+ -event->pos.rel_z);
+ input_report_abs(dev, ABS_X, event->pos.abs_x);
+ input_report_abs(dev, ABS_Y, event->pos.abs_y);
+ break;
+ }
+ if (dev)
+ input_sync(dev);
+ }
+ mb(); /* ensure we got ring contents */
+ page->in_cons = cons;
+ notify_remote_via_irq(info->irq);
+
+ return IRQ_HANDLED;
+}
+
+int __devinit xenkbd_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int ret, i;
+ struct xenkbd_info *info;
+ struct input_dev *kbd, *ptr;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
+ return -ENOMEM;
+ }
+ dev->dev.driver_data = info;
+ info->xbdev = dev;
+ snprintf(info->phys, sizeof(info->phys), "xenbus/%s", dev->nodename);
+
+ info->page = (void *)__get_free_page(GFP_KERNEL);
+ if (!info->page)
+ goto error_nomem;
+ info->page->in_cons = info->page->in_prod = 0;
+ info->page->out_cons = info->page->out_prod = 0;
+
+ /* keyboard */
+ kbd = input_allocate_device();
+ if (!kbd)
+ goto error_nomem;
+ kbd->name = "Xen Virtual Keyboard";
+ kbd->phys = info->phys;
+ kbd->id.bustype = BUS_PCI;
+ kbd->id.vendor = 0x5853;
+ kbd->id.product = 0xffff;
+ kbd->evbit[0] = BIT(EV_KEY);
+ for (i = KEY_ESC; i < KEY_UNKNOWN; i++)
+ set_bit(i, kbd->keybit);
+ for (i = KEY_OK; i < KEY_MAX; i++)
+ set_bit(i, kbd->keybit);
+
+ ret = input_register_device(kbd);
+ if (ret) {
+ input_free_device(kbd);
+ xenbus_dev_fatal(dev, ret, "input_register_device(kbd)");
+ goto error;
+ }
+ info->kbd = kbd;
+
+ /* pointing device */
+ ptr = input_allocate_device();
+ if (!ptr)
+ goto error_nomem;
+ ptr->name = "Xen Virtual Pointer";
+ ptr->phys = info->phys;
+ ptr->id.bustype = BUS_PCI;
+ ptr->id.vendor = 0x5853;
+ ptr->id.product = 0xfffe;
+ ptr->evbit[0] = BIT(EV_KEY) | BIT(EV_REL) | BIT(EV_ABS);
+ for (i = BTN_LEFT; i <= BTN_TASK; i++)
+ set_bit(i, ptr->keybit);
+ ptr->relbit[0] = BIT(REL_X) | BIT(REL_Y) | BIT(REL_WHEEL);
+ input_set_abs_params(ptr, ABS_X, 0, XENFB_WIDTH, 0, 0);
+ input_set_abs_params(ptr, ABS_Y, 0, XENFB_HEIGHT, 0, 0);
+
+ ret = input_register_device(ptr);
+ if (ret) {
+ input_free_device(ptr);
+ xenbus_dev_fatal(dev, ret, "input_register_device(ptr)");
+ goto error;
+ }
+ info->ptr = ptr;
+
+ ret = xenkbd_connect_backend(dev, info);
+ if (ret < 0)
+ goto error;
+
+ return 0;
+
+ error_nomem:
+ ret = -ENOMEM;
+ xenbus_dev_fatal(dev, ret, "allocating device memory");
+ error:
+ xenkbd_remove(dev);
+ return ret;
+}
+
+static int xenkbd_resume(struct xenbus_device *dev)
+{
+ struct xenkbd_info *info = dev->dev.driver_data;
+
+ xenkbd_disconnect_backend(info);
+ info->page->in_cons = info->page->in_prod = 0;
+ info->page->out_cons = info->page->out_prod = 0;
+ return xenkbd_connect_backend(dev, info);
+}
+
+static int xenkbd_remove(struct xenbus_device *dev)
+{
+ struct xenkbd_info *info = dev->dev.driver_data;
+
+ xenkbd_disconnect_backend(info);
+ input_unregister_device(info->kbd);
+ input_unregister_device(info->ptr);
+ free_page((unsigned long)info->page);
+ kfree(info);
+ return 0;
+}
+
+static int xenkbd_connect_backend(struct xenbus_device *dev,
+ struct xenkbd_info *info)
+{
+ int ret;
+ struct xenbus_transaction xbt;
+
+ ret = bind_listening_port_to_irqhandler(
+ dev->otherend_id, input_handler, 0, "xenkbd", info);
+ if (ret < 0) {
+ xenbus_dev_fatal(dev, ret,
+ "bind_listening_port_to_irqhandler");
+ return ret;
+ }
+ info->irq = ret;
+
+ again:
+ ret = xenbus_transaction_start(&xbt);
+ if (ret) {
+ xenbus_dev_fatal(dev, ret, "starting transaction");
+ return ret;
+ }
+ ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
+ virt_to_mfn(info->page));
+ if (ret)
+ goto error_xenbus;
+ ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
+ irq_to_evtchn_port(info->irq));
+ if (ret)
+ goto error_xenbus;
+ ret = xenbus_transaction_end(xbt, 0);
+ if (ret) {
+ if (ret == -EAGAIN)
+ goto again;
+ xenbus_dev_fatal(dev, ret, "completing transaction");
+ return ret;
+ }
+
+ xenbus_switch_state(dev, XenbusStateInitialised);
+ return 0;
+
+ error_xenbus:
+ xenbus_transaction_end(xbt, 1);
+ xenbus_dev_fatal(dev, ret, "writing xenstore");
+ return ret;
+}
+
+static void xenkbd_disconnect_backend(struct xenkbd_info *info)
+{
+ if (info->irq >= 0)
+ unbind_from_irqhandler(info->irq, info);
+ info->irq = -1;
+}
+
+static void xenkbd_backend_changed(struct xenbus_device *dev,
+ enum xenbus_state backend_state)
+{
+ struct xenkbd_info *info = dev->dev.driver_data;
+ int ret, val;
+
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitialised:
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ break;
+
+ case XenbusStateInitWait:
+ InitWait:
+ ret = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "feature-abs-pointer", "%d", &val);
+ if (ret < 0)
+ val = 0;
+ if (val) {
+ ret = xenbus_printf(XBT_NIL, info->xbdev->nodename,
+ "request-abs-pointer", "1");
+ if (ret)
+ ; /* FIXME */
+ }
+ xenbus_switch_state(dev, XenbusStateConnected);
+ break;
+
+ case XenbusStateConnected:
+ /*
+ * Work around xenbus race condition: If backend goes
+ * through InitWait to Connected fast enough, we can
+ * get Connected twice here.
+ */
+ if (dev->state != XenbusStateConnected)
+ goto InitWait; /* no InitWait seen yet, fudge it */
+
+ /* Set input abs params to match backend screen res */
+ if (xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "width", "%d", &val) > 0 )
+ input_set_abs_params(info->ptr, ABS_X, 0, val, 0, 0);
+
+ if (xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "height", "%d", &val) > 0 )
+ input_set_abs_params(info->ptr, ABS_Y, 0, val, 0, 0);
+
+ break;
+
+ case XenbusStateClosing:
+ xenbus_frontend_closed(dev);
+ break;
+ }
+}
+
+static const struct xenbus_device_id xenkbd_ids[] = {
+ { "vkbd" },
+ { "" }
+};
+MODULE_ALIAS("xen:vkbd");
+
+static struct xenbus_driver xenkbd_driver = {
+ .name = "vkbd",
+ .ids = xenkbd_ids,
+ .probe = xenkbd_probe,
+ .remove = xenkbd_remove,
+ .resume = xenkbd_resume,
+ .otherend_changed = xenkbd_backend_changed,
+};
+
+static int __init xenkbd_init(void)
+{
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ /* Nothing to do if running in dom0. */
+ if (is_initial_xendomain())
+ return -ENODEV;
+
+ return xenbus_register_frontend(&xenkbd_driver);
+}
+
+static void __exit xenkbd_cleanup(void)
+{
+ return xenbus_unregister_driver(&xenkbd_driver);
+}
+
+module_init(xenkbd_init);
+module_exit(xenkbd_cleanup);
+
+MODULE_LICENSE("GPL");
--- /dev/null
+obj-$(CONFIG_XEN_GRANT_DEV) := gntdev.o
--- /dev/null
+/******************************************************************************
+ * gntdev.c
+ *
+ * Device for accessing (in user-space) pages that have been granted by other
+ * domains.
+ *
+ * Copyright (c) 2006-2007, D G Murray.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <asm/atomic.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <xen/gnttab.h>
+#include <asm/hypervisor.h>
+#include <xen/balloon.h>
+#include <xen/evtchn.h>
+#include <xen/driver_util.h>
+
+#include <linux/types.h>
+#include <xen/public/gntdev.h>
+
+
+#define DRIVER_AUTHOR "Derek G. Murray <Derek.Murray@cl.cam.ac.uk>"
+#define DRIVER_DESC "User-space granted page access driver"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
+#define MAX_GRANTS_LIMIT 1024
+#define DEFAULT_MAX_GRANTS 128
+
+/* A slot can be in one of three states:
+ *
+ * 0. GNTDEV_SLOT_INVALID:
+ * This slot is not associated with a grant reference, and is therefore free
+ * to be overwritten by a new grant reference.
+ *
+ * 1. GNTDEV_SLOT_NOT_YET_MAPPED:
+ * This slot is associated with a grant reference (via the
+ * IOCTL_GNTDEV_MAP_GRANT_REF ioctl), but it has not yet been mmap()-ed.
+ *
+ * 2. GNTDEV_SLOT_MAPPED:
+ * This slot is associated with a grant reference, and has been mmap()-ed.
+ */
+typedef enum gntdev_slot_state {
+ GNTDEV_SLOT_INVALID = 0,
+ GNTDEV_SLOT_NOT_YET_MAPPED,
+ GNTDEV_SLOT_MAPPED
+} gntdev_slot_state_t;
+
+#define GNTDEV_INVALID_HANDLE -1
+#define GNTDEV_FREE_LIST_INVALID -1
+/* Each opened instance of gntdev is associated with a list of grants,
+ * represented by an array of elements of the following type,
+ * gntdev_grant_info_t.
+ */
+typedef struct gntdev_grant_info {
+ gntdev_slot_state_t state;
+ union {
+ uint32_t free_list_index;
+ struct {
+ domid_t domid;
+ grant_ref_t ref;
+ grant_handle_t kernel_handle;
+ grant_handle_t user_handle;
+ uint64_t dev_bus_addr;
+ } valid;
+ } u;
+} gntdev_grant_info_t;
+
+/* Private data structure, which is stored in the file pointer for files
+ * associated with this device.
+ */
+typedef struct gntdev_file_private_data {
+
+ /* Array of grant information. */
+ gntdev_grant_info_t *grants;
+ uint32_t grants_size;
+
+ /* Read/write semaphore used to protect the grants array. */
+ struct rw_semaphore grants_sem;
+
+ /* An array of indices of free slots in the grants array.
+ * N.B. An entry in this list may temporarily have the value
+ * GNTDEV_FREE_LIST_INVALID if the corresponding slot has been removed
+ * from the list by the contiguous allocator, but the list has not yet
+ * been compressed. However, this is not visible across invocations of
+ * the device.
+ */
+ int32_t *free_list;
+
+ /* The number of free slots in the grants array. */
+ uint32_t free_list_size;
+
+ /* Read/write semaphore used to protect the free list. */
+ struct rw_semaphore free_list_sem;
+
+ /* Index of the next slot after the most recent contiguous allocation,
+ * for use in a next-fit allocator.
+ */
+ uint32_t next_fit_index;
+
+ /* Used to map grants into the kernel, before mapping them into user
+ * space.
+ */
+ struct page **foreign_pages;
+
+} gntdev_file_private_data_t;
+
+/* Module lifecycle operations. */
+static int __init gntdev_init(void);
+static void __exit gntdev_exit(void);
+
+module_init(gntdev_init);
+module_exit(gntdev_exit);
+
+/* File operations. */
+static int gntdev_open(struct inode *inode, struct file *flip);
+static int gntdev_release(struct inode *inode, struct file *flip);
+static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma);
+static long gntdev_ioctl(struct file *flip,
+ unsigned int cmd, unsigned long arg);
+
+static const struct file_operations gntdev_fops = {
+ .owner = THIS_MODULE,
+ .open = gntdev_open,
+ .release = gntdev_release,
+ .mmap = gntdev_mmap,
+ .unlocked_ioctl = gntdev_ioctl
+};
+
+/* VM operations. */
+static void gntdev_vma_close(struct vm_area_struct *vma);
+static pte_t gntdev_clear_pte(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, int is_fullmm);
+
+static struct vm_operations_struct gntdev_vmops = {
+ .close = gntdev_vma_close,
+ .zap_pte = gntdev_clear_pte
+};
+
+/* Global variables. */
+
+/* The driver major number, for use when unregistering the driver. */
+static int gntdev_major;
+
+#define GNTDEV_NAME "gntdev"
+
+/* Memory mapping functions
+ * ------------------------
+ *
+ * Every granted page is mapped into both kernel and user space, and the two
+ * following functions return the respective virtual addresses of these pages.
+ *
+ * When shadow paging is disabled, the granted page is mapped directly into
+ * user space; when it is enabled, it is mapped into the kernel and remapped
+ * into user space using vm_insert_page() (see gntdev_mmap(), below).
+ */
+
+/* Returns the virtual address (in user space) of the @page_index'th page
+ * in the given VM area.
+ */
+static inline unsigned long get_user_vaddr (struct vm_area_struct *vma,
+ int page_index)
+{
+ return (unsigned long) vma->vm_start + (page_index << PAGE_SHIFT);
+}
+
+/* Returns the virtual address (in kernel space) of the @slot_index'th page
+ * mapped by the gntdev instance that owns the given private data struct.
+ */
+static inline unsigned long get_kernel_vaddr (gntdev_file_private_data_t *priv,
+ int slot_index)
+{
+ unsigned long pfn;
+ void *kaddr;
+ pfn = page_to_pfn(priv->foreign_pages[slot_index]);
+ kaddr = pfn_to_kaddr(pfn);
+ return (unsigned long) kaddr;
+}
+
+/* Helper functions. */
+
+/* Adds information about a grant reference to the list of grants in the file's
+ * private data structure. Returns non-zero on failure. On success, sets the
+ * value of *offset to the offset that should be mmap()-ed in order to map the
+ * grant reference.
+ */
+static int add_grant_reference(struct file *flip,
+ struct ioctl_gntdev_grant_ref *op,
+ uint64_t *offset)
+{
+ gntdev_file_private_data_t *private_data
+ = (gntdev_file_private_data_t *) flip->private_data;
+
+ uint32_t slot_index;
+
+ if (unlikely(private_data->free_list_size == 0)) {
+ return -ENOMEM;
+ }
+
+ slot_index = private_data->free_list[--private_data->free_list_size];
+ private_data->free_list[private_data->free_list_size]
+ = GNTDEV_FREE_LIST_INVALID;
+
+ /* Copy the grant information into file's private data. */
+ private_data->grants[slot_index].state = GNTDEV_SLOT_NOT_YET_MAPPED;
+ private_data->grants[slot_index].u.valid.domid = op->domid;
+ private_data->grants[slot_index].u.valid.ref = op->ref;
+
+ /* The offset is calculated as the index of the chosen entry in the
+ * file's private data's array of grant information. This is then
+ * shifted to give an offset into the virtual "file address space".
+ */
+ *offset = slot_index << PAGE_SHIFT;
+
+ return 0;
+}
+
+/* Adds the @count grant references to the contiguous range in the slot array
+ * beginning at @first_slot. It is assumed that @first_slot was returned by a
+ * previous invocation of find_contiguous_free_range(), during the same
+ * invocation of the driver.
+ */
+static int add_grant_references(struct file *flip,
+ int count,
+ struct ioctl_gntdev_grant_ref *ops,
+ uint32_t first_slot)
+{
+ gntdev_file_private_data_t *private_data
+ = (gntdev_file_private_data_t *) flip->private_data;
+ int i;
+
+ for (i = 0; i < count; ++i) {
+
+ /* First, mark the slot's entry in the free list as invalid. */
+ int free_list_index =
+ private_data->grants[first_slot+i].u.free_list_index;
+ private_data->free_list[free_list_index] =
+ GNTDEV_FREE_LIST_INVALID;
+
+ /* Now, update the slot. */
+ private_data->grants[first_slot+i].state =
+ GNTDEV_SLOT_NOT_YET_MAPPED;
+ private_data->grants[first_slot+i].u.valid.domid =
+ ops[i].domid;
+ private_data->grants[first_slot+i].u.valid.ref = ops[i].ref;
+ }
+
+ return 0;
+}
+
+/* Scans through the free list for @flip, removing entries that are marked as
+ * GNTDEV_SLOT_INVALID. This will reduce the recorded size of the free list to
+ * the number of valid entries.
+ */
+static void compress_free_list(struct file *flip)
+{
+ gntdev_file_private_data_t *private_data
+ = (gntdev_file_private_data_t *) flip->private_data;
+ int i, j = 0, old_size, slot_index;
+
+ old_size = private_data->free_list_size;
+ for (i = 0; i < old_size; ++i) {
+ if (private_data->free_list[i] != GNTDEV_FREE_LIST_INVALID) {
+ if (i > j) {
+ slot_index = private_data->free_list[i];
+ private_data->free_list[j] = slot_index;
+ private_data->grants[slot_index].u
+ .free_list_index = j;
+ private_data->free_list[i]
+ = GNTDEV_FREE_LIST_INVALID;
+ }
+ ++j;
+ } else {
+ --private_data->free_list_size;
+ }
+ }
+}
+
+/* Searches the grant array in the private data of @flip for a range of
+ * @num_slots contiguous slots in the GNTDEV_SLOT_INVALID state.
+ *
+ * Returns the index of the first slot if a range is found, otherwise -ENOMEM.
+ */
+static int find_contiguous_free_range(struct file *flip,
+ uint32_t num_slots)
+{
+ gntdev_file_private_data_t *private_data
+ = (gntdev_file_private_data_t *) flip->private_data;
+
+ int i;
+ int start_index = private_data->next_fit_index;
+ int range_start = 0, range_length;
+
+ if (private_data->free_list_size < num_slots) {
+ return -ENOMEM;
+ }
+
+ /* First search from the start_index to the end of the array. */
+ range_length = 0;
+ for (i = start_index; i < private_data->grants_size; ++i) {
+ if (private_data->grants[i].state == GNTDEV_SLOT_INVALID) {
+ if (range_length == 0) {
+ range_start = i;
+ }
+ ++range_length;
+ if (range_length == num_slots) {
+ return range_start;
+ }
+ }
+ }
+
+ /* Now search from the start of the array to the start_index. */
+ range_length = 0;
+ for (i = 0; i < start_index; ++i) {
+ if (private_data->grants[i].state == GNTDEV_SLOT_INVALID) {
+ if (range_length == 0) {
+ range_start = i;
+ }
+ ++range_length;
+ if (range_length == num_slots) {
+ return range_start;
+ }
+ }
+ }
+
+ return -ENOMEM;
+}
+
+static int init_private_data(gntdev_file_private_data_t *priv,
+ uint32_t max_grants)
+{
+ int i;
+
+ /* Allocate space for the kernel-mapping of granted pages. */
+ priv->foreign_pages =
+ alloc_empty_pages_and_pagevec(max_grants);
+ if (!priv->foreign_pages)
+ goto nomem_out;
+
+ /* Allocate the grant list and free-list. */
+ priv->grants = kmalloc(max_grants * sizeof(gntdev_grant_info_t),
+ GFP_KERNEL);
+ if (!priv->grants)
+ goto nomem_out2;
+ priv->free_list = kmalloc(max_grants * sizeof(int32_t), GFP_KERNEL);
+ if (!priv->free_list)
+ goto nomem_out3;
+
+ /* Initialise the free-list, which contains all slots at first. */
+ for (i = 0; i < max_grants; ++i) {
+ priv->free_list[max_grants - i - 1] = i;
+ priv->grants[i].state = GNTDEV_SLOT_INVALID;
+ priv->grants[i].u.free_list_index = max_grants - i - 1;
+ }
+ priv->grants_size = max_grants;
+ priv->free_list_size = max_grants;
+ priv->next_fit_index = 0;
+
+ up_write(&priv->grants_sem);
+ up_write(&priv->free_list_sem);
+
+ return 0;
+
+nomem_out3:
+ kfree(priv->grants);
+nomem_out2:
+ free_empty_pages_and_pagevec(priv->foreign_pages, max_grants);
+nomem_out:
+ return -ENOMEM;
+
+}
+
+/* Interface functions. */
+
+/* Initialises the driver. Called when the module is loaded. */
+static int __init gntdev_init(void)
+{
+ struct class *class;
+ struct device *device;
+
+ if (!is_running_on_xen()) {
+ printk(KERN_ERR "You must be running Xen to use gntdev\n");
+ return -ENODEV;
+ }
+
+ gntdev_major = register_chrdev(0, GNTDEV_NAME, &gntdev_fops);
+ if (gntdev_major < 0)
+ {
+ printk(KERN_ERR "Could not register gntdev device\n");
+ return -ENOMEM;
+ }
+
+ /* Note that if the sysfs code fails, we will still initialise the
+ * device, and output the major number so that the device can be
+ * created manually using mknod.
+ */
+ if ((class = get_xen_class()) == NULL) {
+ printk(KERN_ERR "Error setting up xen_class\n");
+ printk(KERN_ERR "gntdev created with major number = %d\n",
+ gntdev_major);
+ return 0;
+ }
+
+ device = device_create(class, NULL, MKDEV(gntdev_major, 0),
+ GNTDEV_NAME);
+ if (IS_ERR(device)) {
+ printk(KERN_ERR "Error creating gntdev device in xen_class\n");
+ printk(KERN_ERR "gntdev created with major number = %d\n",
+ gntdev_major);
+ return 0;
+ }
+
+ return 0;
+}
+
+/* Cleans up and unregisters the driver. Called when the driver is unloaded.
+ */
+static void __exit gntdev_exit(void)
+{
+ struct class *class;
+ if ((class = get_xen_class()) != NULL)
+ device_destroy(class, MKDEV(gntdev_major, 0));
+ unregister_chrdev(gntdev_major, GNTDEV_NAME);
+}
+
+/* Called when the device is opened. */
+static int gntdev_open(struct inode *inode, struct file *flip)
+{
+ gntdev_file_private_data_t *private_data;
+
+ try_module_get(THIS_MODULE);
+
+ /* Allocate space for the per-instance private data. */
+ private_data = kmalloc(sizeof(*private_data), GFP_KERNEL);
+ if (!private_data)
+ goto nomem_out;
+
+ /* These will be lazily initialised by init_private_data. */
+ private_data->grants = NULL;
+ private_data->free_list = NULL;
+ private_data->foreign_pages = NULL;
+
+ init_rwsem(&private_data->grants_sem);
+ init_rwsem(&private_data->free_list_sem);
+
+ flip->private_data = private_data;
+
+ return 0;
+
+nomem_out:
+ return -ENOMEM;
+}
+
+/* Called when the device is closed.
+ */
+static int gntdev_release(struct inode *inode, struct file *flip)
+{
+ if (flip->private_data) {
+ gntdev_file_private_data_t *private_data =
+ (gntdev_file_private_data_t *) flip->private_data;
+ if (private_data->foreign_pages)
+ free_empty_pages_and_pagevec
+ (private_data->foreign_pages,
+ private_data->grants_size);
+ if (private_data->grants)
+ kfree(private_data->grants);
+ if (private_data->free_list)
+ kfree(private_data->free_list);
+ kfree(private_data);
+ }
+ module_put(THIS_MODULE);
+ return 0;
+}
+
+/* Called when an attempt is made to mmap() the device. The private data from
+ * @flip contains the list of grant references that can be mapped. The vm_pgoff
+ * field of @vma contains the index into that list that refers to the grant
+ * reference that will be mapped. Only mappings that are a multiple of
+ * PAGE_SIZE are handled.
+ */
+static int gntdev_mmap (struct file *flip, struct vm_area_struct *vma)
+{
+ struct gnttab_map_grant_ref op;
+ unsigned long slot_index = vma->vm_pgoff;
+ unsigned long kernel_vaddr, user_vaddr;
+ uint32_t size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ uint64_t ptep;
+ int ret;
+ int flags;
+ int i;
+ struct page *page;
+ gntdev_file_private_data_t *private_data = flip->private_data;
+
+ if (unlikely(!private_data)) {
+ printk(KERN_ERR "File's private data is NULL.\n");
+ return -EINVAL;
+ }
+
+ /* Test to make sure that the grants array has been initialised. */
+ down_read(&private_data->grants_sem);
+ if (unlikely(!private_data->grants)) {
+ up_read(&private_data->grants_sem);
+ printk(KERN_ERR "Attempted to mmap before ioctl.\n");
+ return -EINVAL;
+ }
+ up_read(&private_data->grants_sem);
+
+ if (unlikely((size <= 0) ||
+ (size + slot_index) > private_data->grants_size)) {
+ printk(KERN_ERR "Invalid number of pages or offset"
+ "(num_pages = %d, first_slot = %ld).\n",
+ size, slot_index);
+ return -ENXIO;
+ }
+
+ if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+ printk(KERN_ERR "Writable mappings must be shared.\n");
+ return -EINVAL;
+ }
+
+ /* Slots must be in the NOT_YET_MAPPED state. */
+ down_write(&private_data->grants_sem);
+ for (i = 0; i < size; ++i) {
+ if (private_data->grants[slot_index + i].state !=
+ GNTDEV_SLOT_NOT_YET_MAPPED) {
+ printk(KERN_ERR "Slot (index = %ld) is in the wrong "
+ "state (%d).\n", slot_index + i,
+ private_data->grants[slot_index + i].state);
+ up_write(&private_data->grants_sem);
+ return -EINVAL;
+ }
+ }
+
+ /* Install the hook for unmapping. */
+ vma->vm_ops = &gntdev_vmops;
+
+ /* The VM area contains pages from another VM. */
+ vma->vm_flags |= VM_FOREIGN;
+ vma->vm_private_data = kzalloc(size * sizeof(struct page *),
+ GFP_KERNEL);
+ if (vma->vm_private_data == NULL) {
+ printk(KERN_ERR "Couldn't allocate mapping structure for VM "
+ "area.\n");
+ return -ENOMEM;
+ }
+
+ /* This flag prevents Bad PTE errors when the memory is unmapped. */
+ vma->vm_flags |= VM_RESERVED;
+
+ /* This flag prevents this VM area being copied on a fork(). A better
+ * behaviour might be to explicitly carry out the appropriate mappings
+ * on fork(), but I don't know if there's a hook for this.
+ */
+ vma->vm_flags |= VM_DONTCOPY;
+
+#ifdef CONFIG_X86
+ /* This flag ensures that the page tables are not unpinned before the
+ * VM area is unmapped. Therefore Xen still recognises the PTE as
+ * belonging to an L1 pagetable, and the grant unmap operation will
+ * succeed, even if the process does not exit cleanly.
+ */
+ vma->vm_mm->context.has_foreign_mappings = 1;
+#endif
+
+ for (i = 0; i < size; ++i) {
+
+ flags = GNTMAP_host_map;
+ if (!(vma->vm_flags & VM_WRITE))
+ flags |= GNTMAP_readonly;
+
+ kernel_vaddr = get_kernel_vaddr(private_data, slot_index + i);
+ user_vaddr = get_user_vaddr(vma, i);
+ page = pfn_to_page(__pa(kernel_vaddr) >> PAGE_SHIFT);
+
+ gnttab_set_map_op(&op, kernel_vaddr, flags,
+ private_data->grants[slot_index+i]
+ .u.valid.ref,
+ private_data->grants[slot_index+i]
+ .u.valid.domid);
+
+ /* Carry out the mapping of the grant reference. */
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+ &op, 1);
+ BUG_ON(ret);
+ if (op.status) {
+ printk(KERN_ERR "Error mapping the grant reference "
+ "into the kernel (%d). domid = %d; ref = %d\n",
+ op.status,
+ private_data->grants[slot_index+i]
+ .u.valid.domid,
+ private_data->grants[slot_index+i]
+ .u.valid.ref);
+ goto undo_map_out;
+ }
+
+ /* Store a reference to the page that will be mapped into user
+ * space.
+ */
+ ((struct page **) vma->vm_private_data)[i] = page;
+
+ /* Mark mapped page as reserved. */
+ SetPageReserved(page);
+
+ /* Record the grant handle, for use in the unmap operation. */
+ private_data->grants[slot_index+i].u.valid.kernel_handle =
+ op.handle;
+ private_data->grants[slot_index+i].u.valid.dev_bus_addr =
+ op.dev_bus_addr;
+
+ private_data->grants[slot_index+i].state = GNTDEV_SLOT_MAPPED;
+ private_data->grants[slot_index+i].u.valid.user_handle =
+ GNTDEV_INVALID_HANDLE;
+
+ /* Now perform the mapping to user space. */
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+
+ /* NOT USING SHADOW PAGE TABLES. */
+ /* In this case, we map the grant(s) straight into user
+ * space.
+ */
+
+ /* Get the machine address of the PTE for the user
+ * page.
+ */
+ if ((ret = create_lookup_pte_addr(vma->vm_mm,
+ vma->vm_start
+ + (i << PAGE_SHIFT),
+ &ptep)))
+ {
+ printk(KERN_ERR "Error obtaining PTE pointer "
+ "(%d).\n", ret);
+ goto undo_map_out;
+ }
+
+ /* Configure the map operation. */
+
+ /* The reference is to be used by host CPUs. */
+ flags = GNTMAP_host_map;
+
+ /* Specifies a user space mapping. */
+ flags |= GNTMAP_application_map;
+
+ /* The map request contains the machine address of the
+ * PTE to update.
+ */
+ flags |= GNTMAP_contains_pte;
+
+ if (!(vma->vm_flags & VM_WRITE))
+ flags |= GNTMAP_readonly;
+
+ gnttab_set_map_op(&op, ptep, flags,
+ private_data->grants[slot_index+i]
+ .u.valid.ref,
+ private_data->grants[slot_index+i]
+ .u.valid.domid);
+
+ /* Carry out the mapping of the grant reference. */
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+ &op, 1);
+ BUG_ON(ret);
+ if (op.status) {
+ printk(KERN_ERR "Error mapping the grant "
+ "reference into user space (%d). domid "
+ "= %d; ref = %d\n", op.status,
+ private_data->grants[slot_index+i].u
+ .valid.domid,
+ private_data->grants[slot_index+i].u
+ .valid.ref);
+ goto undo_map_out;
+ }
+
+ /* Record the grant handle, for use in the unmap
+ * operation.
+ */
+ private_data->grants[slot_index+i].u.
+ valid.user_handle = op.handle;
+
+ /* Update p2m structure with the new mapping. */
+ set_phys_to_machine(__pa(kernel_vaddr) >> PAGE_SHIFT,
+ FOREIGN_FRAME(private_data->
+ grants[slot_index+i]
+ .u.valid.dev_bus_addr
+ >> PAGE_SHIFT));
+ } else {
+ /* USING SHADOW PAGE TABLES. */
+ /* In this case, we simply insert the page into the VM
+ * area. */
+ ret = vm_insert_page(vma, user_vaddr, page);
+ }
+
+ }
+
+ up_write(&private_data->grants_sem);
+ return 0;
+
+undo_map_out:
+ /* If we have a mapping failure, the unmapping will be taken care of
+ * by do_mmap_pgoff(), which will eventually call gntdev_clear_pte().
+ * All we need to do here is free the vma_private_data.
+ */
+ kfree(vma->vm_private_data);
+
+ /* THIS IS VERY UNPLEASANT: do_mmap_pgoff() will set the vma->vm_file
+ * to NULL on failure. However, we need this in gntdev_clear_pte() to
+ * unmap the grants. Therefore, we smuggle a reference to the file's
+ * private data in the VM area's private data pointer.
+ */
+ vma->vm_private_data = private_data;
+
+ up_write(&private_data->grants_sem);
+
+ return -ENOMEM;
+}
+
+static pte_t gntdev_clear_pte(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, int is_fullmm)
+{
+ int slot_index, ret;
+ pte_t copy;
+ struct gnttab_unmap_grant_ref op;
+ gntdev_file_private_data_t *private_data;
+
+ /* THIS IS VERY UNPLEASANT: do_mmap_pgoff() will set the vma->vm_file
+ * to NULL on failure. However, we need this in gntdev_clear_pte() to
+ * unmap the grants. Therefore, we smuggle a reference to the file's
+ * private data in the VM area's private data pointer.
+ */
+ if (vma->vm_file) {
+ private_data = (gntdev_file_private_data_t *)
+ vma->vm_file->private_data;
+ } else if (vma->vm_private_data) {
+ private_data = (gntdev_file_private_data_t *)
+ vma->vm_private_data;
+ } else {
+ private_data = NULL; /* gcc warning */
+ BUG();
+ }
+
+ /* Calculate the grant relating to this PTE. */
+ slot_index = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+
+ /* Only unmap grants if the slot has been mapped. This could be being
+ * called from a failing mmap().
+ */
+ if (private_data->grants[slot_index].state == GNTDEV_SLOT_MAPPED) {
+
+ /* First, we clear the user space mapping, if it has been made.
+ */
+ if (private_data->grants[slot_index].u.valid.user_handle !=
+ GNTDEV_INVALID_HANDLE &&
+ !xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* NOT USING SHADOW PAGE TABLES. */
+
+ /* Copy the existing value of the PTE for returning. */
+ copy = *ptep;
+
+ gnttab_set_unmap_op(&op, virt_to_machine(ptep),
+ GNTMAP_contains_pte,
+ private_data->grants[slot_index]
+ .u.valid.user_handle);
+ ret = HYPERVISOR_grant_table_op(
+ GNTTABOP_unmap_grant_ref, &op, 1);
+ BUG_ON(ret);
+ if (op.status)
+ printk("User unmap grant status = %d\n",
+ op.status);
+ } else {
+ /* USING SHADOW PAGE TABLES. */
+ copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
+ }
+
+ /* Finally, we unmap the grant from kernel space. */
+ gnttab_set_unmap_op(&op,
+ get_kernel_vaddr(private_data, slot_index),
+ GNTMAP_host_map,
+ private_data->grants[slot_index].u.valid
+ .kernel_handle);
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ &op, 1);
+ BUG_ON(ret);
+ if (op.status)
+ printk("Kernel unmap grant status = %d\n", op.status);
+
+
+ /* Return slot to the not-yet-mapped state, so that it may be
+ * mapped again, or removed by a subsequent ioctl.
+ */
+ private_data->grants[slot_index].state =
+ GNTDEV_SLOT_NOT_YET_MAPPED;
+
+ /* Invalidate the physical to machine mapping for this page. */
+ set_phys_to_machine(__pa(get_kernel_vaddr(private_data,
+ slot_index))
+ >> PAGE_SHIFT, INVALID_P2M_ENTRY);
+
+ } else {
+ copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
+ }
+
+ return copy;
+}
+
+/* "Destructor" for a VM area.
+ */
+static void gntdev_vma_close(struct vm_area_struct *vma) {
+ if (vma->vm_private_data) {
+ kfree(vma->vm_private_data);
+ }
+}
+
+/* Called when an ioctl is made on the device.
+ */
+static long gntdev_ioctl(struct file *flip,
+ unsigned int cmd, unsigned long arg)
+{
+ int rc = 0;
+ gntdev_file_private_data_t *private_data =
+ (gntdev_file_private_data_t *) flip->private_data;
+
+ /* On the first invocation, we will lazily initialise the grant array
+ * and free-list.
+ */
+ if (unlikely(!private_data->grants)
+ && likely(cmd != IOCTL_GNTDEV_SET_MAX_GRANTS)) {
+ down_write(&private_data->grants_sem);
+
+ if (unlikely(private_data->grants)) {
+ up_write(&private_data->grants_sem);
+ goto private_data_initialised;
+ }
+
+ /* Just use the default. Setting to a non-default is handled
+ * in the ioctl switch.
+ */
+ rc = init_private_data(private_data, DEFAULT_MAX_GRANTS);
+
+ up_write(&private_data->grants_sem);
+
+ if (rc) {
+ printk (KERN_ERR "Initialising gntdev private data "
+ "failed.\n");
+ return rc;
+ }
+ }
+
+private_data_initialised:
+ switch (cmd) {
+ case IOCTL_GNTDEV_MAP_GRANT_REF:
+ {
+ struct ioctl_gntdev_map_grant_ref op;
+ down_write(&private_data->grants_sem);
+ down_write(&private_data->free_list_sem);
+
+ if ((rc = copy_from_user(&op, (void __user *) arg,
+ sizeof(op)))) {
+ rc = -EFAULT;
+ goto map_out;
+ }
+ if (unlikely(op.count <= 0)) {
+ rc = -EINVAL;
+ goto map_out;
+ }
+
+ if (op.count == 1) {
+ if ((rc = add_grant_reference(flip, &op.refs[0],
+ &op.index)) < 0) {
+ printk(KERN_ERR "Adding grant reference "
+ "failed (%d).\n", rc);
+ goto map_out;
+ }
+ } else {
+ struct ioctl_gntdev_grant_ref *refs, *u;
+ refs = kmalloc(op.count * sizeof(*refs), GFP_KERNEL);
+ if (!refs) {
+ rc = -ENOMEM;
+ goto map_out;
+ }
+ u = ((struct ioctl_gntdev_map_grant_ref *)arg)->refs;
+ if ((rc = copy_from_user(refs,
+ (void __user *)u,
+ sizeof(*refs) * op.count))) {
+ printk(KERN_ERR "Copying refs from user failed"
+ " (%d).\n", rc);
+ rc = -EINVAL;
+ goto map_out;
+ }
+ if ((rc = find_contiguous_free_range(flip, op.count))
+ < 0) {
+ printk(KERN_ERR "Finding contiguous range "
+ "failed (%d).\n", rc);
+ kfree(refs);
+ goto map_out;
+ }
+ op.index = rc << PAGE_SHIFT;
+ if ((rc = add_grant_references(flip, op.count,
+ refs, rc))) {
+ printk(KERN_ERR "Adding grant references "
+ "failed (%d).\n", rc);
+ kfree(refs);
+ goto map_out;
+ }
+ compress_free_list(flip);
+ kfree(refs);
+ }
+ if ((rc = copy_to_user((void __user *) arg,
+ &op,
+ sizeof(op)))) {
+ printk(KERN_ERR "Copying result back to user failed "
+ "(%d)\n", rc);
+ rc = -EFAULT;
+ goto map_out;
+ }
+ map_out:
+ up_write(&private_data->grants_sem);
+ up_write(&private_data->free_list_sem);
+ return rc;
+ }
+ case IOCTL_GNTDEV_UNMAP_GRANT_REF:
+ {
+ struct ioctl_gntdev_unmap_grant_ref op;
+ int i, start_index;
+
+ down_write(&private_data->grants_sem);
+ down_write(&private_data->free_list_sem);
+
+ if ((rc = copy_from_user(&op,
+ (void __user *) arg,
+ sizeof(op)))) {
+ rc = -EFAULT;
+ goto unmap_out;
+ }
+
+ start_index = op.index >> PAGE_SHIFT;
+
+ /* First, check that all pages are in the NOT_YET_MAPPED
+ * state.
+ */
+ for (i = 0; i < op.count; ++i) {
+ if (unlikely
+ (private_data->grants[start_index + i].state
+ != GNTDEV_SLOT_NOT_YET_MAPPED)) {
+ if (private_data->grants[start_index + i].state
+ == GNTDEV_SLOT_INVALID) {
+ printk(KERN_ERR
+ "Tried to remove an invalid "
+ "grant at offset 0x%x.",
+ (start_index + i)
+ << PAGE_SHIFT);
+ rc = -EINVAL;
+ } else {
+ printk(KERN_ERR
+ "Tried to remove a grant which "
+ "is currently mmap()-ed at "
+ "offset 0x%x.",
+ (start_index + i)
+ << PAGE_SHIFT);
+ rc = -EBUSY;
+ }
+ goto unmap_out;
+ }
+ }
+
+ /* Unmap pages and add them to the free list.
+ */
+ for (i = 0; i < op.count; ++i) {
+ private_data->grants[start_index+i].state =
+ GNTDEV_SLOT_INVALID;
+ private_data->grants[start_index+i].u.free_list_index =
+ private_data->free_list_size;
+ private_data->free_list[private_data->free_list_size] =
+ start_index + i;
+ ++private_data->free_list_size;
+ }
+
+ unmap_out:
+ up_write(&private_data->grants_sem);
+ up_write(&private_data->free_list_sem);
+ return rc;
+ }
+ case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
+ {
+ struct ioctl_gntdev_get_offset_for_vaddr op;
+ struct vm_area_struct *vma;
+ unsigned long vaddr;
+
+ if ((rc = copy_from_user(&op,
+ (void __user *) arg,
+ sizeof(op)))) {
+ rc = -EFAULT;
+ goto get_offset_out;
+ }
+ vaddr = (unsigned long)op.vaddr;
+
+ down_read(¤t->mm->mmap_sem);
+ vma = find_vma(current->mm, vaddr);
+ if (vma == NULL) {
+ rc = -EFAULT;
+ goto get_offset_unlock_out;
+ }
+ if ((!vma->vm_ops) || (vma->vm_ops != &gntdev_vmops)) {
+ printk(KERN_ERR "The vaddr specified does not belong "
+ "to a gntdev instance: %#lx\n", vaddr);
+ rc = -EFAULT;
+ goto get_offset_unlock_out;
+ }
+ if (vma->vm_start != vaddr) {
+ printk(KERN_ERR "The vaddr specified in an "
+ "IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR must be at "
+ "the start of the VM area. vma->vm_start = "
+ "%#lx; vaddr = %#lx\n",
+ vma->vm_start, vaddr);
+ rc = -EFAULT;
+ goto get_offset_unlock_out;
+ }
+ op.offset = vma->vm_pgoff << PAGE_SHIFT;
+ op.count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ up_read(¤t->mm->mmap_sem);
+ if ((rc = copy_to_user((void __user *) arg,
+ &op,
+ sizeof(op)))) {
+ rc = -EFAULT;
+ goto get_offset_out;
+ }
+ goto get_offset_out;
+ get_offset_unlock_out:
+ up_read(¤t->mm->mmap_sem);
+ get_offset_out:
+ return rc;
+ }
+ case IOCTL_GNTDEV_SET_MAX_GRANTS:
+ {
+ struct ioctl_gntdev_set_max_grants op;
+ if ((rc = copy_from_user(&op,
+ (void __user *) arg,
+ sizeof(op)))) {
+ rc = -EFAULT;
+ goto set_max_out;
+ }
+ down_write(&private_data->grants_sem);
+ if (private_data->grants) {
+ rc = -EBUSY;
+ goto set_max_unlock_out;
+ }
+ if (op.count > MAX_GRANTS_LIMIT) {
+ rc = -EINVAL;
+ goto set_max_unlock_out;
+ }
+ rc = init_private_data(private_data, op.count);
+ set_max_unlock_out:
+ up_write(&private_data->grants_sem);
+ set_max_out:
+ return rc;
+ }
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
+}
--- /dev/null
+obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o
+obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o
+
+netbk-y := netback.o xenbus.o interface.o accel.o
+netloop-y := loopback.o
--- /dev/null
+/******************************************************************************
+ * drivers/xen/netback/accel.c
+ *
+ * Interface between backend virtual network device and accelerated plugin.
+ *
+ * Copyright (C) 2007 Solarflare Communications, Inc
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/list.h>
+#include <asm/atomic.h>
+#include <xen/xenbus.h>
+#include <linux/mutex.h>
+
+#include "common.h"
+
+#if 0
+#undef DPRINTK
+#define DPRINTK(fmt, args...) \
+ printk("netback/accel (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
+#endif
+
+/*
+ * A list of available netback accelerator plugin modules (each list
+ * entry is of type struct netback_accelerator)
+ */
+static struct list_head accelerators_list;
+/* Lock used to protect access to accelerators_list */
+DEFINE_MUTEX(accelerators_mutex);
+
+/*
+ * Compare a backend to an accelerator, and decide if they are
+ * compatible (i.e. if the accelerator should be used by the
+ * backend)
+ */
+static int match_accelerator(struct xenbus_device *xendev,
+ struct backend_info *be,
+ struct netback_accelerator *accelerator)
+{
+ int rc = 0;
+ char *eth_name = xenbus_read(XBT_NIL, xendev->nodename, "accel", NULL);
+
+ if (IS_ERR(eth_name)) {
+ /* Probably means not present */
+ DPRINTK("%s: no match due to xenbus_read accel error %ld\n",
+ __FUNCTION__, PTR_ERR(eth_name));
+ return 0;
+ } else {
+ if (!strcmp(eth_name, accelerator->eth_name))
+ rc = 1;
+ kfree(eth_name);
+ return rc;
+ }
+}
+
+
+static void do_probe(struct backend_info *be,
+ struct netback_accelerator *accelerator,
+ struct xenbus_device *xendev)
+{
+ be->accelerator = accelerator;
+ atomic_inc(&be->accelerator->use_count);
+ if (be->accelerator->hooks->probe(xendev) != 0) {
+ atomic_dec(&be->accelerator->use_count);
+ module_put(be->accelerator->hooks->owner);
+ be->accelerator = NULL;
+ }
+}
+
+
+/*
+ * Notify suitable backends that a new accelerator is available and
+ * connected. This will also notify the accelerator plugin module
+ * that it is being used for a device through the probe hook.
+ */
+static int netback_accelerator_probe_backend(struct device *dev, void *arg)
+{
+ struct netback_accelerator *accelerator =
+ (struct netback_accelerator *)arg;
+ struct xenbus_device *xendev = to_xenbus_device(dev);
+
+ if (!strcmp("vif", xendev->devicetype)) {
+ struct backend_info *be = xendev->dev.driver_data;
+
+ if (match_accelerator(xendev, be, accelerator) &&
+ try_module_get(accelerator->hooks->owner)) {
+ do_probe(be, accelerator, xendev);
+ }
+ }
+ return 0;
+}
+
+
+/*
+ * Notify suitable backends that an accelerator is unavailable.
+ */
+static int netback_accelerator_remove_backend(struct device *dev, void *arg)
+{
+ struct xenbus_device *xendev = to_xenbus_device(dev);
+ struct netback_accelerator *accelerator =
+ (struct netback_accelerator *)arg;
+
+ if (!strcmp("vif", xendev->devicetype)) {
+ struct backend_info *be = xendev->dev.driver_data;
+
+ if (be->accelerator == accelerator) {
+ be->accelerator->hooks->remove(xendev);
+ atomic_dec(&be->accelerator->use_count);
+ module_put(be->accelerator->hooks->owner);
+ be->accelerator = NULL;
+ }
+ }
+ return 0;
+}
+
+
+
+/*
+ * Entry point for an netback accelerator plugin module. Called to
+ * advertise its presence, and connect to any suitable backends.
+ */
+int netback_connect_accelerator(unsigned version, int id, const char *eth_name,
+ struct netback_accel_hooks *hooks)
+{
+ struct netback_accelerator *new_accelerator;
+ unsigned eth_name_len;
+
+ if (version != NETBACK_ACCEL_VERSION) {
+ if (version > NETBACK_ACCEL_VERSION) {
+ /* Caller has higher version number, leave it
+ up to them to decide whether to continue.
+ They can recall with a lower number if
+ they're happy to be compatible with us */
+ return NETBACK_ACCEL_VERSION;
+ } else {
+ /* We have a more recent version than caller.
+ Currently reject, but may in future be able
+ to be backwardly compatible */
+ return -EPROTO;
+ }
+ }
+
+ new_accelerator =
+ kmalloc(sizeof(struct netback_accelerator), GFP_KERNEL);
+ if (!new_accelerator) {
+ DPRINTK("%s: failed to allocate memory for accelerator\n",
+ __FUNCTION__);
+ return -ENOMEM;
+ }
+
+ new_accelerator->id = id;
+
+ eth_name_len = strlen(eth_name)+1;
+ new_accelerator->eth_name = kmalloc(eth_name_len, GFP_KERNEL);
+ if (!new_accelerator->eth_name) {
+ DPRINTK("%s: failed to allocate memory for eth_name string\n",
+ __FUNCTION__);
+ kfree(new_accelerator);
+ return -ENOMEM;
+ }
+ strlcpy(new_accelerator->eth_name, eth_name, eth_name_len);
+
+ new_accelerator->hooks = hooks;
+
+ atomic_set(&new_accelerator->use_count, 0);
+
+ mutex_lock(&accelerators_mutex);
+ list_add(&new_accelerator->link, &accelerators_list);
+
+ /* tell existing backends about new plugin */
+ xenbus_for_each_backend(new_accelerator,
+ netback_accelerator_probe_backend);
+
+ mutex_unlock(&accelerators_mutex);
+
+ return 0;
+
+}
+EXPORT_SYMBOL_GPL(netback_connect_accelerator);
+
+
+/*
+ * Disconnect an accelerator plugin module that has previously been
+ * connected.
+ */
+void netback_disconnect_accelerator(int id, const char *eth_name)
+{
+ struct netback_accelerator *accelerator, *next;
+
+ mutex_lock(&accelerators_mutex);
+ list_for_each_entry_safe(accelerator, next, &accelerators_list, link) {
+ if (!strcmp(eth_name, accelerator->eth_name)) {
+ xenbus_for_each_backend
+ (accelerator, netback_accelerator_remove_backend);
+ BUG_ON(atomic_read(&accelerator->use_count) != 0);
+ list_del(&accelerator->link);
+ kfree(accelerator->eth_name);
+ kfree(accelerator);
+ break;
+ }
+ }
+ mutex_unlock(&accelerators_mutex);
+}
+EXPORT_SYMBOL_GPL(netback_disconnect_accelerator);
+
+
+void netback_probe_accelerators(struct backend_info *be,
+ struct xenbus_device *dev)
+{
+ struct netback_accelerator *accelerator;
+
+ /*
+ * Check list of accelerators to see if any is suitable, and
+ * use it if it is.
+ */
+ mutex_lock(&accelerators_mutex);
+ list_for_each_entry(accelerator, &accelerators_list, link) {
+ if (match_accelerator(dev, be, accelerator) &&
+ try_module_get(accelerator->hooks->owner)) {
+ do_probe(be, accelerator, dev);
+ break;
+ }
+ }
+ mutex_unlock(&accelerators_mutex);
+}
+
+
+void netback_remove_accelerators(struct backend_info *be,
+ struct xenbus_device *dev)
+{
+ mutex_lock(&accelerators_mutex);
+ /* Notify the accelerator (if any) of this device's removal */
+ if (be->accelerator != NULL) {
+ be->accelerator->hooks->remove(dev);
+ atomic_dec(&be->accelerator->use_count);
+ module_put(be->accelerator->hooks->owner);
+ be->accelerator = NULL;
+ }
+ mutex_unlock(&accelerators_mutex);
+}
+
+
+void netif_accel_init(void)
+{
+ INIT_LIST_HEAD(&accelerators_list);
+}
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/common.h
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __NETIF__BACKEND__COMMON_H__
+#define __NETIF__BACKEND__COMMON_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/wait.h>
+#include <xen/evtchn.h>
+#include <xen/interface/io/netif.h>
+#include <asm/io.h>
+#include <asm/pgalloc.h>
+#include <xen/interface/grant_table.h>
+#include <xen/gnttab.h>
+#include <xen/driver_util.h>
+#include <xen/xenbus.h>
+
+#define DPRINTK(_f, _a...) \
+ pr_debug("(file=%s, line=%d) " _f, \
+ __FILE__ , __LINE__ , ## _a )
+#define IPRINTK(fmt, args...) \
+ printk(KERN_INFO "xen_net: " fmt, ##args)
+#define WPRINTK(fmt, args...) \
+ printk(KERN_WARNING "xen_net: " fmt, ##args)
+
+typedef struct netif_st {
+ /* Unique identifier for this interface. */
+ domid_t domid;
+ unsigned int handle;
+
+ u8 fe_dev_addr[6];
+
+ /* Physical parameters of the comms window. */
+ grant_handle_t tx_shmem_handle;
+ grant_ref_t tx_shmem_ref;
+ grant_handle_t rx_shmem_handle;
+ grant_ref_t rx_shmem_ref;
+ unsigned int irq;
+
+ /* The shared rings and indexes. */
+ netif_tx_back_ring_t tx;
+ netif_rx_back_ring_t rx;
+ struct vm_struct *tx_comms_area;
+ struct vm_struct *rx_comms_area;
+
+ /* Set of features that can be turned on in dev->features. */
+ int features;
+
+ /* Internal feature information. */
+ u8 can_queue:1; /* can queue packets for receiver? */
+ u8 copying_receiver:1; /* copy packets to receiver? */
+
+ /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
+ RING_IDX rx_req_cons_peek;
+
+ /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
+ unsigned long credit_bytes;
+ unsigned long credit_usec;
+ unsigned long remaining_credit;
+ struct timer_list credit_timeout;
+
+ /* Enforce draining of the transmit queue. */
+ struct timer_list tx_queue_timeout;
+
+ /* Miscellaneous private stuff. */
+ struct list_head list; /* scheduling list */
+ atomic_t refcnt;
+ struct net_device *dev;
+ struct net_device_stats stats;
+
+ unsigned int carrier;
+
+ wait_queue_head_t waiting_to_free;
+} netif_t;
+
+/*
+ * Implement our own carrier flag: the network stack's version causes delays
+ * when the carrier is re-enabled (in particular, dev_activate() may not
+ * immediately be called, which can cause packet loss; also the etherbridge
+ * can be rather lazy in activating its port).
+ */
+#define netback_carrier_on(netif) ((netif)->carrier = 1)
+#define netback_carrier_off(netif) ((netif)->carrier = 0)
+#define netback_carrier_ok(netif) ((netif)->carrier)
+
+enum {
+ NETBK_DONT_COPY_SKB,
+ NETBK_DELAYED_COPY_SKB,
+ NETBK_ALWAYS_COPY_SKB,
+};
+
+extern int netbk_copy_skb_mode;
+
+/* Function pointers into netback accelerator plugin modules */
+struct netback_accel_hooks {
+ struct module *owner;
+ int (*probe)(struct xenbus_device *dev);
+ int (*remove)(struct xenbus_device *dev);
+};
+
+/* Structure to track the state of a netback accelerator plugin */
+struct netback_accelerator {
+ struct list_head link;
+ int id;
+ char *eth_name;
+ atomic_t use_count;
+ struct netback_accel_hooks *hooks;
+};
+
+struct backend_info {
+ struct xenbus_device *dev;
+ netif_t *netif;
+ enum xenbus_state frontend_state;
+
+ /* State relating to the netback accelerator */
+ void *netback_accel_priv;
+ /* The accelerator that this backend is currently using */
+ struct netback_accelerator *accelerator;
+};
+
+#define NETBACK_ACCEL_VERSION 0x00010001
+
+/*
+ * Connect an accelerator plugin module to netback. Returns zero on
+ * success, < 0 on error, > 0 (with highest version number supported)
+ * if version mismatch.
+ */
+extern int netback_connect_accelerator(unsigned version,
+ int id, const char *eth_name,
+ struct netback_accel_hooks *hooks);
+/* Disconnect a previously connected accelerator plugin module */
+extern void netback_disconnect_accelerator(int id, const char *eth_name);
+
+
+extern
+void netback_probe_accelerators(struct backend_info *be,
+ struct xenbus_device *dev);
+extern
+void netback_remove_accelerators(struct backend_info *be,
+ struct xenbus_device *dev);
+extern
+void netif_accel_init(void);
+
+
+#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
+#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
+
+void netif_disconnect(netif_t *netif);
+
+netif_t *netif_alloc(domid_t domid, unsigned int handle);
+int netif_map(netif_t *netif, unsigned long tx_ring_ref,
+ unsigned long rx_ring_ref, unsigned int evtchn);
+
+#define netif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define netif_put(_b) \
+ do { \
+ if ( atomic_dec_and_test(&(_b)->refcnt) ) \
+ wake_up(&(_b)->waiting_to_free); \
+ } while (0)
+
+void netif_xenbus_init(void);
+
+#define netif_schedulable(netif) \
+ (netif_running((netif)->dev) && netback_carrier_ok(netif))
+
+void netif_schedule_work(netif_t *netif);
+void netif_deschedule_work(netif_t *netif);
+
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
+struct net_device_stats *netif_be_get_stats(struct net_device *dev);
+irqreturn_t netif_be_int(int irq, void *dev_id);
+
+static inline int netbk_can_queue(struct net_device *dev)
+{
+ netif_t *netif = netdev_priv(dev);
+ return netif->can_queue;
+}
+
+static inline int netbk_can_sg(struct net_device *dev)
+{
+ netif_t *netif = netdev_priv(dev);
+ return netif->features & NETIF_F_SG;
+}
+
+#endif /* __NETIF__BACKEND__COMMON_H__ */
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/interface.c
+ *
+ * Network-device interface management.
+ *
+ * Copyright (c) 2004-2005, Keir Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+
+/*
+ * Module parameter 'queue_length':
+ *
+ * Enables queuing in the network stack when a client has run out of receive
+ * descriptors. Although this feature can improve receive bandwidth by avoiding
+ * packet loss, it can also result in packets sitting in the 'tx_queue' for
+ * unbounded time. This is bad if those packets hold onto foreign resources.
+ * For example, consider a packet that holds onto resources belonging to the
+ * guest for which it is queued (e.g., packet received on vif1.0, destined for
+ * vif1.1 which is not activated in the guest): in this situation the guest
+ * will never be destroyed, unless vif1.1 is taken down. To avoid this, we
+ * run a timer (tx_queue_timeout) to drain the queue when the interface is
+ * blocked.
+ */
+static unsigned long netbk_queue_length = 32;
+module_param_named(queue_length, netbk_queue_length, ulong, 0);
+
+static void __netif_up(netif_t *netif)
+{
+ enable_irq(netif->irq);
+ netif_schedule_work(netif);
+}
+
+static void __netif_down(netif_t *netif)
+{
+ disable_irq(netif->irq);
+ netif_deschedule_work(netif);
+}
+
+static int net_open(struct net_device *dev)
+{
+ netif_t *netif = netdev_priv(dev);
+ if (netback_carrier_ok(netif)) {
+ __netif_up(netif);
+ netif_start_queue(dev);
+ }
+ return 0;
+}
+
+static int net_close(struct net_device *dev)
+{
+ netif_t *netif = netdev_priv(dev);
+ if (netback_carrier_ok(netif))
+ __netif_down(netif);
+ netif_stop_queue(dev);
+ return 0;
+}
+
+static int netbk_change_mtu(struct net_device *dev, int mtu)
+{
+ int max = netbk_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
+
+ if (mtu > max)
+ return -EINVAL;
+ dev->mtu = mtu;
+ return 0;
+}
+
+static int netbk_set_sg(struct net_device *dev, u32 data)
+{
+ if (data) {
+ netif_t *netif = netdev_priv(dev);
+
+ if (!(netif->features & NETIF_F_SG))
+ return -ENOSYS;
+ }
+
+ return ethtool_op_set_sg(dev, data);
+}
+
+static int netbk_set_tso(struct net_device *dev, u32 data)
+{
+ if (data) {
+ netif_t *netif = netdev_priv(dev);
+
+ if (!(netif->features & NETIF_F_TSO))
+ return -ENOSYS;
+ }
+
+ return ethtool_op_set_tso(dev, data);
+}
+
+static struct ethtool_ops network_ethtool_ops =
+{
+ .get_tx_csum = ethtool_op_get_tx_csum,
+ .set_tx_csum = ethtool_op_set_tx_csum,
+ .get_sg = ethtool_op_get_sg,
+ .set_sg = netbk_set_sg,
+ .get_tso = ethtool_op_get_tso,
+ .set_tso = netbk_set_tso,
+ .get_link = ethtool_op_get_link,
+};
+
+netif_t *netif_alloc(domid_t domid, unsigned int handle)
+{
+ int err = 0;
+ struct net_device *dev;
+ netif_t *netif;
+ char name[IFNAMSIZ] = {};
+
+ snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
+ dev = alloc_netdev(sizeof(netif_t), name, ether_setup);
+ if (dev == NULL) {
+ DPRINTK("Could not create netif: out of memory\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ netif = netdev_priv(dev);
+ memset(netif, 0, sizeof(*netif));
+ netif->domid = domid;
+ netif->handle = handle;
+ atomic_set(&netif->refcnt, 1);
+ init_waitqueue_head(&netif->waiting_to_free);
+ netif->dev = dev;
+
+ netback_carrier_off(netif);
+
+ netif->credit_bytes = netif->remaining_credit = ~0UL;
+ netif->credit_usec = 0UL;
+ init_timer(&netif->credit_timeout);
+ /* Initialize 'expires' now: it's used to track the credit window. */
+ netif->credit_timeout.expires = jiffies;
+
+ init_timer(&netif->tx_queue_timeout);
+
+ dev->hard_start_xmit = netif_be_start_xmit;
+ dev->get_stats = netif_be_get_stats;
+ dev->open = net_open;
+ dev->stop = net_close;
+ dev->change_mtu = netbk_change_mtu;
+ dev->features = NETIF_F_IP_CSUM;
+
+ SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
+
+ dev->tx_queue_len = netbk_queue_length;
+
+ /*
+ * Initialise a dummy MAC address. We choose the numerically
+ * largest non-broadcast address to prevent the address getting
+ * stolen by an Ethernet bridge for STP purposes.
+ * (FE:FF:FF:FF:FF:FF)
+ */
+ memset(dev->dev_addr, 0xFF, ETH_ALEN);
+ dev->dev_addr[0] &= ~0x01;
+
+ rtnl_lock();
+ err = register_netdevice(dev);
+ rtnl_unlock();
+ if (err) {
+ DPRINTK("Could not register new net device %s: err=%d\n",
+ dev->name, err);
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
+
+ DPRINTK("Successfully created netif\n");
+ return netif;
+}
+
+static int map_frontend_pages(
+ netif_t *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
+{
+ struct gnttab_map_grant_ref op;
+
+ gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr,
+ GNTMAP_host_map, tx_ring_ref, netif->domid);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status) {
+ DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
+ return op.status;
+ }
+
+ netif->tx_shmem_ref = tx_ring_ref;
+ netif->tx_shmem_handle = op.handle;
+
+ gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr,
+ GNTMAP_host_map, rx_ring_ref, netif->domid);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status) {
+ DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
+ return op.status;
+ }
+
+ netif->rx_shmem_ref = rx_ring_ref;
+ netif->rx_shmem_handle = op.handle;
+
+ return 0;
+}
+
+static void unmap_frontend_pages(netif_t *netif)
+{
+ struct gnttab_unmap_grant_ref op;
+
+ gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
+ GNTMAP_host_map, netif->tx_shmem_handle);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+
+ gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
+ GNTMAP_host_map, netif->rx_shmem_handle);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+}
+
+int netif_map(netif_t *netif, unsigned long tx_ring_ref,
+ unsigned long rx_ring_ref, unsigned int evtchn)
+{
+ int err = -ENOMEM;
+ netif_tx_sring_t *txs;
+ netif_rx_sring_t *rxs;
+
+ /* Already connected through? */
+ if (netif->irq)
+ return 0;
+
+ netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
+ if (netif->tx_comms_area == NULL)
+ return -ENOMEM;
+ netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
+ if (netif->rx_comms_area == NULL)
+ goto err_rx;
+
+ err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
+ if (err)
+ goto err_map;
+
+ err = bind_interdomain_evtchn_to_irqhandler(
+ netif->domid, evtchn, netif_be_int, 0,
+ netif->dev->name, netif);
+ if (err < 0)
+ goto err_hypervisor;
+ netif->irq = err;
+ disable_irq(netif->irq);
+
+ txs = (netif_tx_sring_t *)netif->tx_comms_area->addr;
+ BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
+
+ rxs = (netif_rx_sring_t *)
+ ((char *)netif->rx_comms_area->addr);
+ BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
+
+ netif->rx_req_cons_peek = 0;
+
+ netif_get(netif);
+
+ rtnl_lock();
+ netback_carrier_on(netif);
+ if (netif_running(netif->dev))
+ __netif_up(netif);
+ rtnl_unlock();
+
+ return 0;
+err_hypervisor:
+ unmap_frontend_pages(netif);
+err_map:
+ free_vm_area(netif->rx_comms_area);
+err_rx:
+ free_vm_area(netif->tx_comms_area);
+ return err;
+}
+
+void netif_disconnect(netif_t *netif)
+{
+ if (netback_carrier_ok(netif)) {
+ rtnl_lock();
+ netback_carrier_off(netif);
+ netif_carrier_off(netif->dev); /* discard queued packets */
+ if (netif_running(netif->dev))
+ __netif_down(netif);
+ rtnl_unlock();
+ netif_put(netif);
+ }
+
+ atomic_dec(&netif->refcnt);
+ wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
+
+ del_timer_sync(&netif->credit_timeout);
+ del_timer_sync(&netif->tx_queue_timeout);
+
+ if (netif->irq)
+ unbind_from_irqhandler(netif->irq, netif);
+
+ unregister_netdev(netif->dev);
+
+ if (netif->tx.sring) {
+ unmap_frontend_pages(netif);
+ free_vm_area(netif->tx_comms_area);
+ free_vm_area(netif->rx_comms_area);
+ }
+
+ free_netdev(netif->dev);
+}
--- /dev/null
+/******************************************************************************
+ * netback/loopback.c
+ *
+ * A two-interface loopback device to emulate a local netfront-netback
+ * connection. This ensures that local packet delivery looks identical
+ * to inter-domain delivery. Most importantly, packets delivered locally
+ * originating from other domains will get *copied* when they traverse this
+ * driver. This prevents unbounded delays in socket-buffer queues from
+ * causing the netback driver to "seize up".
+ *
+ * This driver creates a symmetric pair of loopback interfaces with names
+ * vif0.0 and veth0. The intention is that 'vif0.0' is bound to an Ethernet
+ * bridge, just like a proper netback interface, while a local IP interface
+ * is configured on 'veth0'.
+ *
+ * As with a real netback interface, vif0.0 is configured with a suitable
+ * dummy MAC address. No default is provided for veth0: a reasonable strategy
+ * is to transfer eth0's MAC address to veth0, and give eth0 a dummy address
+ * (to avoid confusing the Etherbridge).
+ *
+ * Copyright (c) 2005 K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/ethtool.h>
+#include <net/dst.h>
+#include <net/xfrm.h> /* secpath_reset() */
+#include <asm/hypervisor.h> /* is_initial_xendomain() */
+#include <../net/core/kmap_skb.h> /* k{,un}map_skb_frag() */
+
+static int nloopbacks = -1;
+module_param(nloopbacks, int, 0);
+MODULE_PARM_DESC(nloopbacks, "Number of netback-loopback devices to create");
+
+struct net_private {
+ struct net_device *loopback_dev;
+ struct net_device_stats stats;
+};
+
+static int loopback_open(struct net_device *dev)
+{
+ struct net_private *np = netdev_priv(dev);
+ memset(&np->stats, 0, sizeof(np->stats));
+ netif_start_queue(dev);
+ return 0;
+}
+
+static int loopback_close(struct net_device *dev)
+{
+ netif_stop_queue(dev);
+ return 0;
+}
+
+#ifdef CONFIG_X86
+static int is_foreign(unsigned long pfn)
+{
+ /* NB. Play it safe for auto-translation mode. */
+ return (xen_feature(XENFEAT_auto_translated_physmap) ||
+ (phys_to_machine_mapping[pfn] & FOREIGN_FRAME_BIT));
+}
+#else
+/* How to detect a foreign mapping? Play it safe. */
+#define is_foreign(pfn) (1)
+#endif
+
+static int skb_remove_foreign_references(struct sk_buff *skb)
+{
+ struct page *page;
+ unsigned long pfn;
+ int i, off;
+ char *vaddr;
+
+ BUG_ON(skb_shinfo(skb)->frag_list);
+
+ if (skb_cloned(skb) &&
+ unlikely(pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ return 0;
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ pfn = page_to_pfn(skb_shinfo(skb)->frags[i].page);
+ if (!is_foreign(pfn))
+ continue;
+
+ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(!page))
+ return 0;
+
+ vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
+ off = skb_shinfo(skb)->frags[i].page_offset;
+ memcpy(page_address(page) + off,
+ vaddr + off,
+ skb_shinfo(skb)->frags[i].size);
+ kunmap_skb_frag(vaddr);
+
+ put_page(skb_shinfo(skb)->frags[i].page);
+ skb_shinfo(skb)->frags[i].page = page;
+ }
+
+ return 1;
+}
+
+static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct net_private *np = netdev_priv(dev);
+
+ if (!skb_remove_foreign_references(skb)) {
+ np->stats.tx_dropped++;
+ dev_kfree_skb(skb);
+ return 0;
+ }
+
+ dst_release(skb->dst);
+ skb->dst = NULL;
+
+ skb_orphan(skb);
+
+ np->stats.tx_bytes += skb->len;
+ np->stats.tx_packets++;
+
+ /* Switch to loopback context. */
+ dev = np->loopback_dev;
+ np = netdev_priv(dev);
+
+ np->stats.rx_bytes += skb->len;
+ np->stats.rx_packets++;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ /* Defer checksum calculation. */
+ skb->proto_csum_blank = 1;
+ /* Must be a local packet: assert its integrity. */
+ skb->proto_data_valid = 1;
+ }
+
+ skb->ip_summed = skb->proto_data_valid ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+
+ skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */
+ skb->protocol = eth_type_trans(skb, dev);
+ skb->dev = dev;
+ dev->last_rx = jiffies;
+
+ /* Flush netfilter context: rx'ed skbuffs not expected to have any. */
+ nf_reset(skb);
+ secpath_reset(skb);
+
+ netif_rx(skb);
+
+ return 0;
+}
+
+static struct net_device_stats *loopback_get_stats(struct net_device *dev)
+{
+ struct net_private *np = netdev_priv(dev);
+ return &np->stats;
+}
+
+static struct ethtool_ops network_ethtool_ops =
+{
+ .get_tx_csum = ethtool_op_get_tx_csum,
+ .set_tx_csum = ethtool_op_set_tx_csum,
+ .get_sg = ethtool_op_get_sg,
+ .set_sg = ethtool_op_set_sg,
+ .get_tso = ethtool_op_get_tso,
+ .set_tso = ethtool_op_set_tso,
+ .get_link = ethtool_op_get_link,
+};
+
+/*
+ * Nothing to do here. Virtual interface is point-to-point and the
+ * physical interface is probably promiscuous anyway.
+ */
+static void loopback_set_multicast_list(struct net_device *dev)
+{
+}
+
+static void loopback_construct(struct net_device *dev, struct net_device *lo)
+{
+ struct net_private *np = netdev_priv(dev);
+
+ np->loopback_dev = lo;
+
+ dev->open = loopback_open;
+ dev->stop = loopback_close;
+ dev->hard_start_xmit = loopback_start_xmit;
+ dev->get_stats = loopback_get_stats;
+ dev->set_multicast_list = loopback_set_multicast_list;
+ dev->change_mtu = NULL; /* allow arbitrary mtu */
+
+ dev->tx_queue_len = 0;
+
+ dev->features = (NETIF_F_HIGHDMA |
+ NETIF_F_LLTX |
+ NETIF_F_TSO |
+ NETIF_F_SG |
+ NETIF_F_IP_CSUM);
+
+ SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
+
+ /*
+ * We do not set a jumbo MTU on the interface. Otherwise the network
+ * stack will try to send large packets that will get dropped by the
+ * Ethernet bridge (unless the physical Ethernet interface is
+ * configured to transfer jumbo packets). If a larger MTU is desired
+ * then the system administrator can specify it using the 'ifconfig'
+ * command.
+ */
+ /*dev->mtu = 16*1024;*/
+}
+
+static int __init make_loopback(int i)
+{
+ struct net_device *dev1, *dev2;
+ char dev_name[IFNAMSIZ];
+ int err = -ENOMEM;
+
+ sprintf(dev_name, "vif0.%d", i);
+ dev1 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
+ if (!dev1)
+ return err;
+
+ sprintf(dev_name, "veth%d", i);
+ dev2 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
+ if (!dev2)
+ goto fail_netdev2;
+
+ loopback_construct(dev1, dev2);
+ loopback_construct(dev2, dev1);
+
+ /*
+ * Initialise a dummy MAC address for the 'dummy backend' interface. We
+ * choose the numerically largest non-broadcast address to prevent the
+ * address getting stolen by an Ethernet bridge for STP purposes.
+ */
+ memset(dev1->dev_addr, 0xFF, ETH_ALEN);
+ dev1->dev_addr[0] &= ~0x01;
+
+ if ((err = register_netdev(dev1)) != 0)
+ goto fail;
+
+ if ((err = register_netdev(dev2)) != 0) {
+ unregister_netdev(dev1);
+ goto fail;
+ }
+
+ return 0;
+
+ fail:
+ free_netdev(dev2);
+ fail_netdev2:
+ free_netdev(dev1);
+ return err;
+}
+
+static void __exit clean_loopback(int i)
+{
+ struct net_device *dev1, *dev2;
+ char dev_name[IFNAMSIZ];
+
+ sprintf(dev_name, "vif0.%d", i);
+ dev1 = dev_get_by_name(&init_net, dev_name);
+ sprintf(dev_name, "veth%d", i);
+ dev2 = dev_get_by_name(&init_net, dev_name);
+ if (dev1 && dev2) {
+ unregister_netdev(dev2);
+ unregister_netdev(dev1);
+ free_netdev(dev2);
+ free_netdev(dev1);
+ }
+}
+
+static int __init loopback_init(void)
+{
+ int i, err = 0;
+
+ if (nloopbacks == -1)
+ nloopbacks = is_initial_xendomain() ? 4 : 0;
+
+ for (i = 0; i < nloopbacks; i++)
+ if ((err = make_loopback(i)) != 0)
+ break;
+
+ return err;
+}
+
+module_init(loopback_init);
+
+static void __exit loopback_exit(void)
+{
+ int i;
+
+ for (i = nloopbacks; i-- > 0; )
+ clean_loopback(i);
+}
+
+module_exit(loopback_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null
+/******************************************************************************
+ * drivers/xen/netback/netback.c
+ *
+ * Back-end of the driver for virtual network devices. This portion of the
+ * driver exports a 'unified' network-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ * drivers/xen/netfront/netfront.c
+ *
+ * Copyright (c) 2002-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+#include <xen/balloon.h>
+#include <xen/interface/memory.h>
+
+/*define NETBE_DEBUG_INTERRUPT*/
+
+/* extra field used in struct page */
+#define netif_page_index(pg) (*(long *)&(pg)->mapping)
+
+struct netbk_rx_meta {
+ skb_frag_t frag;
+ int id;
+ u8 copy:1;
+};
+
+struct netbk_tx_pending_inuse {
+ struct list_head list;
+ unsigned long alloc_time;
+};
+
+static void netif_idx_release(u16 pending_idx);
+static void netif_page_release(struct page *page);
+static void make_tx_response(netif_t *netif,
+ netif_tx_request_t *txp,
+ s8 st);
+static netif_rx_response_t *make_rx_response(netif_t *netif,
+ u16 id,
+ s8 st,
+ u16 offset,
+ u16 size,
+ u16 flags);
+
+static void net_tx_action(unsigned long unused);
+static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
+
+static void net_rx_action(unsigned long unused);
+static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
+
+static struct timer_list net_timer;
+static struct timer_list netbk_tx_pending_timer;
+
+#define MAX_PENDING_REQS 256
+
+static struct sk_buff_head rx_queue;
+
+static struct page **mmap_pages;
+static inline unsigned long idx_to_pfn(unsigned int idx)
+{
+ return page_to_pfn(mmap_pages[idx]);
+}
+
+static inline unsigned long idx_to_kaddr(unsigned int idx)
+{
+ return (unsigned long)pfn_to_kaddr(idx_to_pfn(idx));
+}
+
+#define PKT_PROT_LEN 64
+
+static struct pending_tx_info {
+ netif_tx_request_t req;
+ netif_t *netif;
+} pending_tx_info[MAX_PENDING_REQS];
+static u16 pending_ring[MAX_PENDING_REQS];
+typedef unsigned int PEND_RING_IDX;
+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
+static PEND_RING_IDX pending_prod, pending_cons;
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
+
+/* Freed TX SKBs get batched on this ring before return to pending_ring. */
+static u16 dealloc_ring[MAX_PENDING_REQS];
+static PEND_RING_IDX dealloc_prod, dealloc_cons;
+
+/* Doubly-linked list of in-use pending entries. */
+static struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
+static LIST_HEAD(pending_inuse_head);
+
+static struct sk_buff_head tx_queue;
+
+static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
+static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS];
+static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS];
+
+static struct list_head net_schedule_list;
+static spinlock_t net_schedule_list_lock;
+
+#define MAX_MFN_ALLOC 64
+static unsigned long mfn_list[MAX_MFN_ALLOC];
+static unsigned int alloc_index = 0;
+
+/* Setting this allows the safe use of this driver without netloop. */
+static int MODPARM_copy_skb = 1;
+module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
+MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
+
+int netbk_copy_skb_mode;
+
+static inline unsigned long alloc_mfn(void)
+{
+ BUG_ON(alloc_index == 0);
+ return mfn_list[--alloc_index];
+}
+
+static int check_mfn(int nr)
+{
+ struct xen_memory_reservation reservation = {
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ int rc;
+
+ if (likely(alloc_index >= nr))
+ return 0;
+
+ set_xen_guest_handle(reservation.extent_start, mfn_list + alloc_index);
+ reservation.nr_extents = MAX_MFN_ALLOC - alloc_index;
+ rc = HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation);
+ if (likely(rc > 0))
+ alloc_index += rc;
+
+ return alloc_index >= nr ? 0 : -ENOMEM;
+}
+
+static inline void maybe_schedule_tx_action(void)
+{
+ smp_mb();
+ if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
+ !list_empty(&net_schedule_list))
+ tasklet_schedule(&net_tx_tasklet);
+}
+
+static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
+{
+ struct skb_shared_info *ninfo;
+ struct sk_buff *nskb;
+ unsigned long offset;
+ int ret;
+ int len;
+ int headlen;
+
+ BUG_ON(skb_shinfo(skb)->frag_list != NULL);
+
+ nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(!nskb))
+ goto err;
+
+ skb_reserve(nskb, 16 + NET_IP_ALIGN);
+ headlen = skb_end_pointer(nskb) - nskb->data;
+ if (headlen > skb_headlen(skb))
+ headlen = skb_headlen(skb);
+ ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
+ BUG_ON(ret);
+
+ ninfo = skb_shinfo(nskb);
+ ninfo->gso_size = skb_shinfo(skb)->gso_size;
+ ninfo->gso_type = skb_shinfo(skb)->gso_type;
+
+ offset = headlen;
+ len = skb->len - headlen;
+
+ nskb->len = skb->len;
+ nskb->data_len = len;
+ nskb->truesize += len;
+
+ while (len) {
+ struct page *page;
+ int copy;
+ int zero;
+
+ if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
+ dump_stack();
+ goto err_free;
+ }
+
+ copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
+ zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
+
+ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
+ if (unlikely(!page))
+ goto err_free;
+
+ ret = skb_copy_bits(skb, offset, page_address(page), copy);
+ BUG_ON(ret);
+
+ ninfo->frags[ninfo->nr_frags].page = page;
+ ninfo->frags[ninfo->nr_frags].page_offset = 0;
+ ninfo->frags[ninfo->nr_frags].size = copy;
+ ninfo->nr_frags++;
+
+ offset += copy;
+ len -= copy;
+ }
+
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ offset = 0;
+#else
+ offset = nskb->data - skb->data;
+#endif
+
+ nskb->transport_header = skb->transport_header + offset;
+ nskb->network_header = skb->network_header + offset;
+ nskb->mac_header = skb->mac_header + offset;
+
+ return nskb;
+
+ err_free:
+ kfree_skb(nskb);
+ err:
+ return NULL;
+}
+
+static inline int netbk_max_required_rx_slots(netif_t *netif)
+{
+ if (netif->features & (NETIF_F_SG|NETIF_F_TSO))
+ return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
+ return 1; /* all in one */
+}
+
+static inline int netbk_queue_full(netif_t *netif)
+{
+ RING_IDX peek = netif->rx_req_cons_peek;
+ RING_IDX needed = netbk_max_required_rx_slots(netif);
+
+ return ((netif->rx.sring->req_prod - peek) < needed) ||
+ ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
+}
+
+static void tx_queue_callback(unsigned long data)
+{
+ netif_t *netif = (netif_t *)data;
+ if (netif_schedulable(netif))
+ netif_wake_queue(netif->dev);
+}
+
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ netif_t *netif = netdev_priv(dev);
+
+ BUG_ON(skb->dev != dev);
+
+ /* Drop the packet if the target domain has no receive buffers. */
+ if (unlikely(!netif_schedulable(netif) || netbk_queue_full(netif)))
+ goto drop;
+
+ /*
+ * Copy the packet here if it's destined for a flipping interface
+ * but isn't flippable (e.g. extra references to data).
+ * XXX For now we also copy skbuffs whose head crosses a page
+ * boundary, because netbk_gop_skb can't handle them.
+ */
+ if (!netif->copying_receiver ||
+ ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE)) {
+ struct sk_buff *nskb = netbk_copy_skb(skb);
+ if ( unlikely(nskb == NULL) )
+ goto drop;
+ /* Copy only the header fields we use in this driver. */
+ nskb->dev = skb->dev;
+ nskb->ip_summed = skb->ip_summed;
+ nskb->proto_data_valid = skb->proto_data_valid;
+ dev_kfree_skb(skb);
+ skb = nskb;
+ }
+
+ netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
+ !!skb_shinfo(skb)->gso_size;
+ netif_get(netif);
+
+ if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
+ netif->rx.sring->req_event = netif->rx_req_cons_peek +
+ netbk_max_required_rx_slots(netif);
+ mb(); /* request notification /then/ check & stop the queue */
+ if (netbk_queue_full(netif)) {
+ netif_stop_queue(dev);
+ /*
+ * Schedule 500ms timeout to restart the queue, thus
+ * ensuring that an inactive queue will be drained.
+ * Packets will be immediately be dropped until more
+ * receive buffers become available (see
+ * netbk_queue_full() check above).
+ */
+ netif->tx_queue_timeout.data = (unsigned long)netif;
+ netif->tx_queue_timeout.function = tx_queue_callback;
+ __mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
+ }
+ }
+
+ skb_queue_tail(&rx_queue, skb);
+ tasklet_schedule(&net_rx_tasklet);
+
+ return 0;
+
+ drop:
+ netif->stats.tx_dropped++;
+ dev_kfree_skb(skb);
+ return 0;
+}
+
+#if 0
+static void xen_network_done_notify(void)
+{
+ static struct net_device *eth0_dev = NULL;
+ if (unlikely(eth0_dev == NULL))
+ eth0_dev = __dev_get_by_name(&init_net, "eth0");
+ netif_rx_schedule(eth0_dev, ???);
+}
+/*
+ * Add following to poll() function in NAPI driver (Tigon3 is example):
+ * if ( xen_network_done() )
+ * tg3_enable_ints(tp);
+ */
+int xen_network_done(void)
+{
+ return skb_queue_empty(&rx_queue);
+}
+#endif
+
+struct netrx_pending_operations {
+ unsigned trans_prod, trans_cons;
+ unsigned mmu_prod, mmu_mcl;
+ unsigned mcl_prod, mcl_cons;
+ unsigned copy_prod, copy_cons;
+ unsigned meta_prod, meta_cons;
+ mmu_update_t *mmu;
+ gnttab_transfer_t *trans;
+ gnttab_copy_t *copy;
+ multicall_entry_t *mcl;
+ struct netbk_rx_meta *meta;
+};
+
+/* Set up the grant operations for this fragment. If it's a flipping
+ interface, we also set up the unmap request from here. */
+static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta,
+ int i, struct netrx_pending_operations *npo,
+ struct page *page, unsigned long size,
+ unsigned long offset)
+{
+ mmu_update_t *mmu;
+ gnttab_transfer_t *gop;
+ gnttab_copy_t *copy_gop;
+ multicall_entry_t *mcl;
+ netif_rx_request_t *req;
+ unsigned long old_mfn, new_mfn;
+
+ old_mfn = virt_to_mfn(page_address(page));
+
+ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
+ if (netif->copying_receiver) {
+ /* The fragment needs to be copied rather than
+ flipped. */
+ meta->copy = 1;
+ copy_gop = npo->copy + npo->copy_prod++;
+ copy_gop->flags = GNTCOPY_dest_gref;
+ if (PageForeign(page)) {
+ struct pending_tx_info *src_pend =
+ &pending_tx_info[netif_page_index(page)];
+ copy_gop->source.domid = src_pend->netif->domid;
+ copy_gop->source.u.ref = src_pend->req.gref;
+ copy_gop->flags |= GNTCOPY_source_gref;
+ } else {
+ copy_gop->source.domid = DOMID_SELF;
+ copy_gop->source.u.gmfn = old_mfn;
+ }
+ copy_gop->source.offset = offset;
+ copy_gop->dest.domid = netif->domid;
+ copy_gop->dest.offset = 0;
+ copy_gop->dest.u.ref = req->gref;
+ copy_gop->len = size;
+ } else {
+ meta->copy = 0;
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ new_mfn = alloc_mfn();
+
+ /*
+ * Set the new P2M table entry before
+ * reassigning the old data page. Heed the
+ * comment in pgtable-2level.h:pte_page(). :-)
+ */
+ set_phys_to_machine(page_to_pfn(page), new_mfn);
+
+ mcl = npo->mcl + npo->mcl_prod++;
+ MULTI_update_va_mapping(mcl,
+ (unsigned long)page_address(page),
+ pfn_pte_ma(new_mfn, PAGE_KERNEL),
+ 0);
+
+ mmu = npo->mmu + npo->mmu_prod++;
+ mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) |
+ MMU_MACHPHYS_UPDATE;
+ mmu->val = page_to_pfn(page);
+ }
+
+ gop = npo->trans + npo->trans_prod++;
+ gop->mfn = old_mfn;
+ gop->domid = netif->domid;
+ gop->ref = req->gref;
+ }
+ return req->id;
+}
+
+static void netbk_gop_skb(struct sk_buff *skb,
+ struct netrx_pending_operations *npo)
+{
+ netif_t *netif = netdev_priv(skb->dev);
+ int nr_frags = skb_shinfo(skb)->nr_frags;
+ int i;
+ int extra;
+ struct netbk_rx_meta *head_meta, *meta;
+
+ head_meta = npo->meta + npo->meta_prod++;
+ head_meta->frag.page_offset = skb_shinfo(skb)->gso_type;
+ head_meta->frag.size = skb_shinfo(skb)->gso_size;
+ extra = !!head_meta->frag.size + 1;
+
+ for (i = 0; i < nr_frags; i++) {
+ meta = npo->meta + npo->meta_prod++;
+ meta->frag = skb_shinfo(skb)->frags[i];
+ meta->id = netbk_gop_frag(netif, meta, i + extra, npo,
+ meta->frag.page,
+ meta->frag.size,
+ meta->frag.page_offset);
+ }
+
+ /*
+ * This must occur at the end to ensure that we don't trash skb_shinfo
+ * until we're done. We know that the head doesn't cross a page
+ * boundary because such packets get copied in netif_be_start_xmit.
+ */
+ head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo,
+ virt_to_page(skb->data),
+ skb_headlen(skb),
+ offset_in_page(skb->data));
+
+ netif->rx.req_cons += nr_frags + extra;
+}
+
+static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
+{
+ int i;
+
+ for (i = 0; i < nr_frags; i++)
+ put_page(meta[i].frag.page);
+}
+
+/* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was
+ used to set up the operations on the top of
+ netrx_pending_operations, which have since been done. Check that
+ they didn't give any errors and advance over them. */
+static int netbk_check_gop(int nr_frags, domid_t domid,
+ struct netrx_pending_operations *npo)
+{
+ multicall_entry_t *mcl;
+ gnttab_transfer_t *gop;
+ gnttab_copy_t *copy_op;
+ int status = NETIF_RSP_OKAY;
+ int i;
+
+ for (i = 0; i <= nr_frags; i++) {
+ if (npo->meta[npo->meta_cons + i].copy) {
+ copy_op = npo->copy + npo->copy_cons++;
+ if (copy_op->status != GNTST_okay) {
+ DPRINTK("Bad status %d from copy to DOM%d.\n",
+ copy_op->status, domid);
+ status = NETIF_RSP_ERROR;
+ }
+ } else {
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ mcl = npo->mcl + npo->mcl_cons++;
+ /* The update_va_mapping() must not fail. */
+ BUG_ON(mcl->result != 0);
+ }
+
+ gop = npo->trans + npo->trans_cons++;
+ /* Check the reassignment error code. */
+ if (gop->status != 0) {
+ DPRINTK("Bad status %d from grant transfer to DOM%u\n",
+ gop->status, domid);
+ /*
+ * Page no longer belongs to us unless
+ * GNTST_bad_page, but that should be
+ * a fatal error anyway.
+ */
+ BUG_ON(gop->status == GNTST_bad_page);
+ status = NETIF_RSP_ERROR;
+ }
+ }
+ }
+
+ return status;
+}
+
+static void netbk_add_frag_responses(netif_t *netif, int status,
+ struct netbk_rx_meta *meta, int nr_frags)
+{
+ int i;
+ unsigned long offset;
+
+ for (i = 0; i < nr_frags; i++) {
+ int id = meta[i].id;
+ int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data;
+
+ if (meta[i].copy)
+ offset = 0;
+ else
+ offset = meta[i].frag.page_offset;
+ make_rx_response(netif, id, status, offset,
+ meta[i].frag.size, flags);
+ }
+}
+
+static void net_rx_action(unsigned long unused)
+{
+ netif_t *netif = NULL;
+ s8 status;
+ u16 id, irq, flags;
+ netif_rx_response_t *resp;
+ multicall_entry_t *mcl;
+ struct sk_buff_head rxq;
+ struct sk_buff *skb;
+ int notify_nr = 0;
+ int ret;
+ int nr_frags;
+ int count;
+ unsigned long offset;
+
+ /*
+ * Putting hundreds of bytes on the stack is considered rude.
+ * Static works because a tasklet can only be on one CPU at any time.
+ */
+ static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+3];
+ static mmu_update_t rx_mmu[NET_RX_RING_SIZE];
+ static gnttab_transfer_t grant_trans_op[NET_RX_RING_SIZE];
+ static gnttab_copy_t grant_copy_op[NET_RX_RING_SIZE];
+ static unsigned char rx_notify[NR_IRQS];
+ static u16 notify_list[NET_RX_RING_SIZE];
+ static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
+
+ struct netrx_pending_operations npo = {
+ mmu: rx_mmu,
+ trans: grant_trans_op,
+ copy: grant_copy_op,
+ mcl: rx_mcl,
+ meta: meta};
+
+ skb_queue_head_init(&rxq);
+
+ count = 0;
+
+ while ((skb = skb_dequeue(&rx_queue)) != NULL) {
+ nr_frags = skb_shinfo(skb)->nr_frags;
+ *(int *)skb->cb = nr_frags;
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap) &&
+ !((netif_t *)netdev_priv(skb->dev))->copying_receiver &&
+ check_mfn(nr_frags + 1)) {
+ /* Memory squeeze? Back off for an arbitrary while. */
+ if ( net_ratelimit() )
+ WPRINTK("Memory squeeze in netback "
+ "driver.\n");
+ mod_timer(&net_timer, jiffies + HZ);
+ skb_queue_head(&rx_queue, skb);
+ break;
+ }
+
+ netbk_gop_skb(skb, &npo);
+
+ count += nr_frags + 1;
+
+ __skb_queue_tail(&rxq, skb);
+
+ /* Filled the batch queue? */
+ if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
+ break;
+ }
+
+ BUG_ON(npo.meta_prod > ARRAY_SIZE(meta));
+
+ npo.mmu_mcl = npo.mcl_prod;
+ if (npo.mcl_prod) {
+ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
+ BUG_ON(npo.mmu_prod > ARRAY_SIZE(rx_mmu));
+ mcl = npo.mcl + npo.mcl_prod++;
+
+ BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
+ mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
+
+ mcl->op = __HYPERVISOR_mmu_update;
+ mcl->args[0] = (unsigned long)rx_mmu;
+ mcl->args[1] = npo.mmu_prod;
+ mcl->args[2] = 0;
+ mcl->args[3] = DOMID_SELF;
+ }
+
+ if (npo.trans_prod) {
+ BUG_ON(npo.trans_prod > ARRAY_SIZE(grant_trans_op));
+ mcl = npo.mcl + npo.mcl_prod++;
+ mcl->op = __HYPERVISOR_grant_table_op;
+ mcl->args[0] = GNTTABOP_transfer;
+ mcl->args[1] = (unsigned long)grant_trans_op;
+ mcl->args[2] = npo.trans_prod;
+ }
+
+ if (npo.copy_prod) {
+ BUG_ON(npo.copy_prod > ARRAY_SIZE(grant_copy_op));
+ mcl = npo.mcl + npo.mcl_prod++;
+ mcl->op = __HYPERVISOR_grant_table_op;
+ mcl->args[0] = GNTTABOP_copy;
+ mcl->args[1] = (unsigned long)grant_copy_op;
+ mcl->args[2] = npo.copy_prod;
+ }
+
+ /* Nothing to do? */
+ if (!npo.mcl_prod)
+ return;
+
+ BUG_ON(npo.mcl_prod > ARRAY_SIZE(rx_mcl));
+
+ ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
+ BUG_ON(ret != 0);
+ /* The mmu_machphys_update() must not fail. */
+ BUG_ON(npo.mmu_mcl && npo.mcl[npo.mmu_mcl].result != 0);
+
+ while ((skb = __skb_dequeue(&rxq)) != NULL) {
+ nr_frags = *(int *)skb->cb;
+
+ netif = netdev_priv(skb->dev);
+ /* We can't rely on skb_release_data to release the
+ pages used by fragments for us, since it tries to
+ touch the pages in the fraglist. If we're in
+ flipping mode, that doesn't work. In copying mode,
+ we still have access to all of the pages, and so
+ it's safe to let release_data deal with it. */
+ /* (Freeing the fragments is safe since we copy
+ non-linear skbs destined for flipping interfaces) */
+ if (!netif->copying_receiver) {
+ atomic_set(&(skb_shinfo(skb)->dataref), 1);
+ skb_shinfo(skb)->frag_list = NULL;
+ skb_shinfo(skb)->nr_frags = 0;
+ netbk_free_pages(nr_frags, meta + npo.meta_cons + 1);
+ }
+
+ netif->stats.tx_bytes += skb->len;
+ netif->stats.tx_packets++;
+
+ status = netbk_check_gop(nr_frags, netif->domid, &npo);
+
+ id = meta[npo.meta_cons].id;
+ flags = nr_frags ? NETRXF_more_data : 0;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
+ flags |= NETRXF_csum_blank | NETRXF_data_validated;
+ else if (skb->proto_data_valid) /* remote but checksummed? */
+ flags |= NETRXF_data_validated;
+
+ if (meta[npo.meta_cons].copy)
+ offset = 0;
+ else
+ offset = offset_in_page(skb->data);
+ resp = make_rx_response(netif, id, status, offset,
+ skb_headlen(skb), flags);
+
+ if (meta[npo.meta_cons].frag.size) {
+ struct netif_extra_info *gso =
+ (struct netif_extra_info *)
+ RING_GET_RESPONSE(&netif->rx,
+ netif->rx.rsp_prod_pvt++);
+
+ resp->flags |= NETRXF_extra_info;
+
+ gso->u.gso.size = meta[npo.meta_cons].frag.size;
+ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
+ gso->u.gso.pad = 0;
+ gso->u.gso.features = 0;
+
+ gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
+ gso->flags = 0;
+ }
+
+ netbk_add_frag_responses(netif, status,
+ meta + npo.meta_cons + 1,
+ nr_frags);
+
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
+ irq = netif->irq;
+ if (ret && !rx_notify[irq]) {
+ rx_notify[irq] = 1;
+ notify_list[notify_nr++] = irq;
+ }
+
+ if (netif_queue_stopped(netif->dev) &&
+ netif_schedulable(netif) &&
+ !netbk_queue_full(netif))
+ netif_wake_queue(netif->dev);
+
+ netif_put(netif);
+ dev_kfree_skb(skb);
+ npo.meta_cons += nr_frags + 1;
+ }
+
+ while (notify_nr != 0) {
+ irq = notify_list[--notify_nr];
+ rx_notify[irq] = 0;
+ notify_remote_via_irq(irq);
+ }
+
+ /* More work to do? */
+ if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
+ tasklet_schedule(&net_rx_tasklet);
+#if 0
+ else
+ xen_network_done_notify();
+#endif
+}
+
+static void net_alarm(unsigned long unused)
+{
+ tasklet_schedule(&net_rx_tasklet);
+}
+
+static void netbk_tx_pending_timeout(unsigned long unused)
+{
+ tasklet_schedule(&net_tx_tasklet);
+}
+
+struct net_device_stats *netif_be_get_stats(struct net_device *dev)
+{
+ netif_t *netif = netdev_priv(dev);
+ return &netif->stats;
+}
+
+static int __on_net_schedule_list(netif_t *netif)
+{
+ return netif->list.next != NULL;
+}
+
+static void remove_from_net_schedule_list(netif_t *netif)
+{
+ spin_lock_irq(&net_schedule_list_lock);
+ if (likely(__on_net_schedule_list(netif))) {
+ list_del(&netif->list);
+ netif->list.next = NULL;
+ netif_put(netif);
+ }
+ spin_unlock_irq(&net_schedule_list_lock);
+}
+
+static void add_to_net_schedule_list_tail(netif_t *netif)
+{
+ if (__on_net_schedule_list(netif))
+ return;
+
+ spin_lock_irq(&net_schedule_list_lock);
+ if (!__on_net_schedule_list(netif) &&
+ likely(netif_schedulable(netif))) {
+ list_add_tail(&netif->list, &net_schedule_list);
+ netif_get(netif);
+ }
+ spin_unlock_irq(&net_schedule_list_lock);
+}
+
+/*
+ * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
+ * If this driver is pipelining transmit requests then we can be very
+ * aggressive in avoiding new-packet notifications -- frontend only needs to
+ * send a notification if there are no outstanding unreceived responses.
+ * If we may be buffer transmit buffers for any reason then we must be rather
+ * more conservative and treat this as the final check for pending work.
+ */
+void netif_schedule_work(netif_t *netif)
+{
+ int more_to_do;
+
+#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
+ more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
+#else
+ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
+#endif
+
+ if (more_to_do) {
+ add_to_net_schedule_list_tail(netif);
+ maybe_schedule_tx_action();
+ }
+}
+
+void netif_deschedule_work(netif_t *netif)
+{
+ remove_from_net_schedule_list(netif);
+}
+
+
+static void tx_add_credit(netif_t *netif)
+{
+ unsigned long max_burst, max_credit;
+
+ /*
+ * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
+ * Otherwise the interface can seize up due to insufficient credit.
+ */
+ max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
+ max_burst = min(max_burst, 131072UL);
+ max_burst = max(max_burst, netif->credit_bytes);
+
+ /* Take care that adding a new chunk of credit doesn't wrap to zero. */
+ max_credit = netif->remaining_credit + netif->credit_bytes;
+ if (max_credit < netif->remaining_credit)
+ max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
+
+ netif->remaining_credit = min(max_credit, max_burst);
+}
+
+static void tx_credit_callback(unsigned long data)
+{
+ netif_t *netif = (netif_t *)data;
+ tx_add_credit(netif);
+ netif_schedule_work(netif);
+}
+
+static inline int copy_pending_req(PEND_RING_IDX pending_idx)
+{
+ return gnttab_copy_grant_page(grant_tx_handle[pending_idx],
+ &mmap_pages[pending_idx]);
+}
+
+inline static void net_tx_action_dealloc(void)
+{
+ struct netbk_tx_pending_inuse *inuse, *n;
+ gnttab_unmap_grant_ref_t *gop;
+ u16 pending_idx;
+ PEND_RING_IDX dc, dp;
+ netif_t *netif;
+ int ret;
+ LIST_HEAD(list);
+
+ dc = dealloc_cons;
+ gop = tx_unmap_ops;
+
+ /*
+ * Free up any grants we have finished using
+ */
+ do {
+ dp = dealloc_prod;
+
+ /* Ensure we see all indices enqueued by netif_idx_release(). */
+ smp_rmb();
+
+ while (dc != dp) {
+ unsigned long pfn;
+
+ pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
+ list_move_tail(&pending_inuse[pending_idx].list, &list);
+
+ pfn = idx_to_pfn(pending_idx);
+ /* Already unmapped? */
+ if (!phys_to_machine_mapping_valid(pfn))
+ continue;
+
+ gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx),
+ GNTMAP_host_map,
+ grant_tx_handle[pending_idx]);
+ gop++;
+ }
+
+ if (netbk_copy_skb_mode != NETBK_DELAYED_COPY_SKB ||
+ list_empty(&pending_inuse_head))
+ break;
+
+ /* Copy any entries that have been pending for too long. */
+ list_for_each_entry_safe(inuse, n, &pending_inuse_head, list) {
+ if (time_after(inuse->alloc_time + HZ / 2, jiffies))
+ break;
+
+ switch (copy_pending_req(inuse - pending_inuse)) {
+ case 0:
+ list_move_tail(&inuse->list, &list);
+ continue;
+ case -EBUSY:
+ list_del_init(&inuse->list);
+ continue;
+ case -ENOENT:
+ continue;
+ }
+
+ break;
+ }
+ } while (dp != dealloc_prod);
+
+ dealloc_cons = dc;
+
+ ret = HYPERVISOR_grant_table_op(
+ GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
+ BUG_ON(ret);
+
+ list_for_each_entry_safe(inuse, n, &list, list) {
+ pending_idx = inuse - pending_inuse;
+
+ netif = pending_tx_info[pending_idx].netif;
+
+ make_tx_response(netif, &pending_tx_info[pending_idx].req,
+ NETIF_RSP_OKAY);
+
+ /* Ready for next use. */
+ gnttab_reset_grant_page(mmap_pages[pending_idx]);
+
+ pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+
+ netif_put(netif);
+
+ list_del_init(&inuse->list);
+ }
+}
+
+static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end)
+{
+ RING_IDX cons = netif->tx.req_cons;
+
+ do {
+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
+ if (cons >= end)
+ break;
+ txp = RING_GET_REQUEST(&netif->tx, cons++);
+ } while (1);
+ netif->tx.req_cons = cons;
+ netif_schedule_work(netif);
+ netif_put(netif);
+}
+
+static int netbk_count_requests(netif_t *netif, netif_tx_request_t *first,
+ netif_tx_request_t *txp, int work_to_do)
+{
+ RING_IDX cons = netif->tx.req_cons;
+ int frags = 0;
+
+ if (!(first->flags & NETTXF_more_data))
+ return 0;
+
+ do {
+ if (frags >= work_to_do) {
+ DPRINTK("Need more frags\n");
+ return -frags;
+ }
+
+ if (unlikely(frags >= MAX_SKB_FRAGS)) {
+ DPRINTK("Too many frags\n");
+ return -frags;
+ }
+
+ memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
+ sizeof(*txp));
+ if (txp->size > first->size) {
+ DPRINTK("Frags galore\n");
+ return -frags;
+ }
+
+ first->size -= txp->size;
+ frags++;
+
+ if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
+ DPRINTK("txp->offset: %x, size: %u\n",
+ txp->offset, txp->size);
+ return -frags;
+ }
+ } while ((txp++)->flags & NETTXF_more_data);
+
+ return frags;
+}
+
+static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
+ struct sk_buff *skb,
+ netif_tx_request_t *txp,
+ gnttab_map_grant_ref_t *mop)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ skb_frag_t *frags = shinfo->frags;
+ unsigned long pending_idx = *((u16 *)skb->data);
+ int i, start;
+
+ /* Skip first skb fragment if it is on same page as header fragment. */
+ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
+ for (i = start; i < shinfo->nr_frags; i++, txp++) {
+ pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
+
+ gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
+ GNTMAP_host_map | GNTMAP_readonly,
+ txp->gref, netif->domid);
+
+ memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
+ netif_get(netif);
+ pending_tx_info[pending_idx].netif = netif;
+ frags[i].page = (void *)pending_idx;
+ }
+
+ return mop;
+}
+
+static int netbk_tx_check_mop(struct sk_buff *skb,
+ gnttab_map_grant_ref_t **mopp)
+{
+ gnttab_map_grant_ref_t *mop = *mopp;
+ int pending_idx = *((u16 *)skb->data);
+ netif_t *netif = pending_tx_info[pending_idx].netif;
+ netif_tx_request_t *txp;
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int nr_frags = shinfo->nr_frags;
+ int i, err, start;
+
+ /* Check status of header. */
+ err = mop->status;
+ if (unlikely(err)) {
+ txp = &pending_tx_info[pending_idx].req;
+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
+ pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+ netif_put(netif);
+ } else {
+ set_phys_to_machine(
+ __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
+ FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
+ grant_tx_handle[pending_idx] = mop->handle;
+ }
+
+ /* Skip first skb fragment if it is on same page as header fragment. */
+ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
+ for (i = start; i < nr_frags; i++) {
+ int j, newerr;
+
+ pending_idx = (unsigned long)shinfo->frags[i].page;
+
+ /* Check error status: if okay then remember grant handle. */
+ newerr = (++mop)->status;
+ if (likely(!newerr)) {
+ set_phys_to_machine(
+ __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
+ FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
+ grant_tx_handle[pending_idx] = mop->handle;
+ /* Had a previous error? Invalidate this fragment. */
+ if (unlikely(err))
+ netif_idx_release(pending_idx);
+ continue;
+ }
+
+ /* Error on this fragment: respond to client with an error. */
+ txp = &pending_tx_info[pending_idx].req;
+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
+ pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+ netif_put(netif);
+
+ /* Not the first error? Preceding frags already invalidated. */
+ if (err)
+ continue;
+
+ /* First error: invalidate header and preceding fragments. */
+ pending_idx = *((u16 *)skb->data);
+ netif_idx_release(pending_idx);
+ for (j = start; j < i; j++) {
+ pending_idx = (unsigned long)shinfo->frags[i].page;
+ netif_idx_release(pending_idx);
+ }
+
+ /* Remember the error: invalidate all subsequent fragments. */
+ err = newerr;
+ }
+
+ *mopp = mop + 1;
+ return err;
+}
+
+static void netbk_fill_frags(struct sk_buff *skb)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int nr_frags = shinfo->nr_frags;
+ int i;
+
+ for (i = 0; i < nr_frags; i++) {
+ skb_frag_t *frag = shinfo->frags + i;
+ netif_tx_request_t *txp;
+ unsigned long pending_idx;
+
+ pending_idx = (unsigned long)frag->page;
+
+ pending_inuse[pending_idx].alloc_time = jiffies;
+ list_add_tail(&pending_inuse[pending_idx].list,
+ &pending_inuse_head);
+
+ txp = &pending_tx_info[pending_idx].req;
+ frag->page = virt_to_page(idx_to_kaddr(pending_idx));
+ frag->size = txp->size;
+ frag->page_offset = txp->offset;
+
+ skb->len += txp->size;
+ skb->data_len += txp->size;
+ skb->truesize += txp->size;
+ }
+}
+
+int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras,
+ int work_to_do)
+{
+ struct netif_extra_info extra;
+ RING_IDX cons = netif->tx.req_cons;
+
+ do {
+ if (unlikely(work_to_do-- <= 0)) {
+ DPRINTK("Missing extra info\n");
+ return -EBADR;
+ }
+
+ memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
+ sizeof(extra));
+ if (unlikely(!extra.type ||
+ extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
+ netif->tx.req_cons = ++cons;
+ DPRINTK("Invalid extra type: %d\n", extra.type);
+ return -EINVAL;
+ }
+
+ memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
+ netif->tx.req_cons = ++cons;
+ } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
+
+ return work_to_do;
+}
+
+static int netbk_set_skb_gso(struct sk_buff *skb, struct netif_extra_info *gso)
+{
+ if (!gso->u.gso.size) {
+ DPRINTK("GSO size must not be zero.\n");
+ return -EINVAL;
+ }
+
+ /* Currently only TCPv4 S.O. is supported. */
+ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
+ DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
+ return -EINVAL;
+ }
+
+ skb_shinfo(skb)->gso_size = gso->u.gso.size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
+ /* Header must be checked, and gso_segs computed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+
+ return 0;
+}
+
+/* Called after netfront has transmitted */
+static void net_tx_action(unsigned long unused)
+{
+ struct list_head *ent;
+ struct sk_buff *skb;
+ netif_t *netif;
+ netif_tx_request_t txreq;
+ netif_tx_request_t txfrags[MAX_SKB_FRAGS];
+ struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
+ u16 pending_idx;
+ RING_IDX i;
+ gnttab_map_grant_ref_t *mop;
+ unsigned int data_len;
+ int ret, work_to_do;
+
+ if (dealloc_cons != dealloc_prod)
+ net_tx_action_dealloc();
+
+ mop = tx_map_ops;
+ while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+ !list_empty(&net_schedule_list)) {
+ /* Get a netif from the list with work to do. */
+ ent = net_schedule_list.next;
+ netif = list_entry(ent, netif_t, list);
+ netif_get(netif);
+ remove_from_net_schedule_list(netif);
+
+ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
+ if (!work_to_do) {
+ netif_put(netif);
+ continue;
+ }
+
+ i = netif->tx.req_cons;
+ rmb(); /* Ensure that we see the request before we copy it. */
+ memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq));
+
+ /* Credit-based scheduling. */
+ if (txreq.size > netif->remaining_credit) {
+ unsigned long now = jiffies;
+ unsigned long next_credit =
+ netif->credit_timeout.expires +
+ msecs_to_jiffies(netif->credit_usec / 1000);
+
+ /* Timer could already be pending in rare cases. */
+ if (timer_pending(&netif->credit_timeout)) {
+ netif_put(netif);
+ continue;
+ }
+
+ /* Passed the point where we can replenish credit? */
+ if (time_after_eq(now, next_credit)) {
+ netif->credit_timeout.expires = now;
+ tx_add_credit(netif);
+ }
+
+ /* Still too big to send right now? Set a callback. */
+ if (txreq.size > netif->remaining_credit) {
+ netif->credit_timeout.data =
+ (unsigned long)netif;
+ netif->credit_timeout.function =
+ tx_credit_callback;
+ __mod_timer(&netif->credit_timeout,
+ next_credit);
+ netif_put(netif);
+ continue;
+ }
+ }
+ netif->remaining_credit -= txreq.size;
+
+ work_to_do--;
+ netif->tx.req_cons = ++i;
+
+ memset(extras, 0, sizeof(extras));
+ if (txreq.flags & NETTXF_extra_info) {
+ work_to_do = netbk_get_extras(netif, extras,
+ work_to_do);
+ i = netif->tx.req_cons;
+ if (unlikely(work_to_do < 0)) {
+ netbk_tx_err(netif, &txreq, i);
+ continue;
+ }
+ }
+
+ ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
+ if (unlikely(ret < 0)) {
+ netbk_tx_err(netif, &txreq, i - ret);
+ continue;
+ }
+ i += ret;
+
+ if (unlikely(txreq.size < ETH_HLEN)) {
+ DPRINTK("Bad packet size: %d\n", txreq.size);
+ netbk_tx_err(netif, &txreq, i);
+ continue;
+ }
+
+ /* No crossing a page as the payload mustn't fragment. */
+ if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
+ DPRINTK("txreq.offset: %x, size: %u, end: %lu\n",
+ txreq.offset, txreq.size,
+ (txreq.offset &~PAGE_MASK) + txreq.size);
+ netbk_tx_err(netif, &txreq, i);
+ continue;
+ }
+
+ pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+
+ data_len = (txreq.size > PKT_PROT_LEN &&
+ ret < MAX_SKB_FRAGS) ?
+ PKT_PROT_LEN : txreq.size;
+
+ skb = alloc_skb(data_len + 16 + NET_IP_ALIGN,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(skb == NULL)) {
+ DPRINTK("Can't allocate a skb in start_xmit.\n");
+ netbk_tx_err(netif, &txreq, i);
+ break;
+ }
+
+ /* Packets passed to netif_rx() must have some headroom. */
+ skb_reserve(skb, 16 + NET_IP_ALIGN);
+
+ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
+ struct netif_extra_info *gso;
+ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
+
+ if (netbk_set_skb_gso(skb, gso)) {
+ kfree_skb(skb);
+ netbk_tx_err(netif, &txreq, i);
+ continue;
+ }
+ }
+
+ gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
+ GNTMAP_host_map | GNTMAP_readonly,
+ txreq.gref, netif->domid);
+ mop++;
+
+ memcpy(&pending_tx_info[pending_idx].req,
+ &txreq, sizeof(txreq));
+ pending_tx_info[pending_idx].netif = netif;
+ *((u16 *)skb->data) = pending_idx;
+
+ __skb_put(skb, data_len);
+
+ skb_shinfo(skb)->nr_frags = ret;
+ if (data_len < txreq.size) {
+ skb_shinfo(skb)->nr_frags++;
+ skb_shinfo(skb)->frags[0].page =
+ (void *)(unsigned long)pending_idx;
+ } else {
+ /* Discriminate from any valid pending_idx value. */
+ skb_shinfo(skb)->frags[0].page = (void *)~0UL;
+ }
+
+ __skb_queue_tail(&tx_queue, skb);
+
+ pending_cons++;
+
+ mop = netbk_get_requests(netif, skb, txfrags, mop);
+
+ netif->tx.req_cons = i;
+ netif_schedule_work(netif);
+
+ if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
+ break;
+ }
+
+ if (mop == tx_map_ops)
+ return;
+
+ ret = HYPERVISOR_grant_table_op(
+ GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops);
+ BUG_ON(ret);
+
+ mop = tx_map_ops;
+ while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
+ netif_tx_request_t *txp;
+
+ pending_idx = *((u16 *)skb->data);
+ netif = pending_tx_info[pending_idx].netif;
+ txp = &pending_tx_info[pending_idx].req;
+
+ /* Check the remap error code. */
+ if (unlikely(netbk_tx_check_mop(skb, &mop))) {
+ DPRINTK("netback grant failed.\n");
+ skb_shinfo(skb)->nr_frags = 0;
+ kfree_skb(skb);
+ continue;
+ }
+
+ data_len = skb->len;
+ memcpy(skb->data,
+ (void *)(idx_to_kaddr(pending_idx)|txp->offset),
+ data_len);
+ if (data_len < txp->size) {
+ /* Append the packet payload as a fragment. */
+ txp->offset += data_len;
+ txp->size -= data_len;
+ } else {
+ /* Schedule a response immediately. */
+ netif_idx_release(pending_idx);
+ }
+
+ /*
+ * Old frontends do not assert data_validated but we
+ * can infer it from csum_blank so test both flags.
+ */
+ if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb->proto_data_valid = 1;
+ } else {
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->proto_data_valid = 0;
+ }
+ skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank);
+
+ netbk_fill_frags(skb);
+
+ skb->dev = netif->dev;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+
+ netif->stats.rx_bytes += skb->len;
+ netif->stats.rx_packets++;
+
+ if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) &&
+ unlikely(skb_linearize(skb))) {
+ DPRINTK("Can't linearize skb in net_tx_action.\n");
+ kfree_skb(skb);
+ continue;
+ }
+
+ netif_rx(skb);
+ netif->dev->last_rx = jiffies;
+ }
+
+ if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
+ !list_empty(&pending_inuse_head)) {
+ struct netbk_tx_pending_inuse *oldest;
+
+ oldest = list_entry(pending_inuse_head.next,
+ struct netbk_tx_pending_inuse, list);
+ mod_timer(&netbk_tx_pending_timer, oldest->alloc_time + HZ);
+ }
+}
+
+static void netif_idx_release(u16 pending_idx)
+{
+ static DEFINE_SPINLOCK(_lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&_lock, flags);
+ dealloc_ring[MASK_PEND_IDX(dealloc_prod)] = pending_idx;
+ /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
+ smp_wmb();
+ dealloc_prod++;
+ spin_unlock_irqrestore(&_lock, flags);
+
+ tasklet_schedule(&net_tx_tasklet);
+}
+
+static void netif_page_release(struct page *page)
+{
+ netif_idx_release(netif_page_index(page));
+}
+
+irqreturn_t netif_be_int(int irq, void *dev_id)
+{
+ netif_t *netif = dev_id;
+
+ add_to_net_schedule_list_tail(netif);
+ maybe_schedule_tx_action();
+
+ if (netif_schedulable(netif) && !netbk_queue_full(netif))
+ netif_wake_queue(netif->dev);
+
+ return IRQ_HANDLED;
+}
+
+static void make_tx_response(netif_t *netif,
+ netif_tx_request_t *txp,
+ s8 st)
+{
+ RING_IDX i = netif->tx.rsp_prod_pvt;
+ netif_tx_response_t *resp;
+ int notify;
+
+ resp = RING_GET_RESPONSE(&netif->tx, i);
+ resp->id = txp->id;
+ resp->status = st;
+
+ if (txp->flags & NETTXF_extra_info)
+ RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
+
+ netif->tx.rsp_prod_pvt = ++i;
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
+ if (notify)
+ notify_remote_via_irq(netif->irq);
+
+#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
+ if (i == netif->tx.req_cons) {
+ int more_to_do;
+ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
+ if (more_to_do)
+ add_to_net_schedule_list_tail(netif);
+ }
+#endif
+}
+
+static netif_rx_response_t *make_rx_response(netif_t *netif,
+ u16 id,
+ s8 st,
+ u16 offset,
+ u16 size,
+ u16 flags)
+{
+ RING_IDX i = netif->rx.rsp_prod_pvt;
+ netif_rx_response_t *resp;
+
+ resp = RING_GET_RESPONSE(&netif->rx, i);
+ resp->offset = offset;
+ resp->flags = flags;
+ resp->id = id;
+ resp->status = (s16)size;
+ if (st < 0)
+ resp->status = (s16)st;
+
+ netif->rx.rsp_prod_pvt = ++i;
+
+ return resp;
+}
+
+#ifdef NETBE_DEBUG_INTERRUPT
+static irqreturn_t netif_be_dbg(int irq, void *dev_id)
+{
+ struct list_head *ent;
+ netif_t *netif;
+ int i = 0;
+
+ printk(KERN_ALERT "netif_schedule_list:\n");
+ spin_lock_irq(&net_schedule_list_lock);
+
+ list_for_each (ent, &net_schedule_list) {
+ netif = list_entry(ent, netif_t, list);
+ printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
+ "rx_resp_prod=%08x\n",
+ i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
+ printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n",
+ netif->tx.req_cons, netif->tx.rsp_prod_pvt);
+ printk(KERN_ALERT " shared(rx_req_prod=%08x "
+ "rx_resp_prod=%08x\n",
+ netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
+ printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n",
+ netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
+ printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n",
+ netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
+ i++;
+ }
+
+ spin_unlock_irq(&net_schedule_list_lock);
+ printk(KERN_ALERT " ** End of netif_schedule_list **\n");
+
+ return IRQ_HANDLED;
+}
+#endif
+
+static int __init netback_init(void)
+{
+ int i;
+ struct page *page;
+
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ /* We can increase reservation by this much in net_rx_action(). */
+ balloon_update_driver_allowance(NET_RX_RING_SIZE);
+
+ skb_queue_head_init(&rx_queue);
+ skb_queue_head_init(&tx_queue);
+
+ init_timer(&net_timer);
+ net_timer.data = 0;
+ net_timer.function = net_alarm;
+
+ init_timer(&netbk_tx_pending_timer);
+ netbk_tx_pending_timer.data = 0;
+ netbk_tx_pending_timer.function = netbk_tx_pending_timeout;
+
+ mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
+ if (mmap_pages == NULL) {
+ printk("%s: out of memory\n", __FUNCTION__);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < MAX_PENDING_REQS; i++) {
+ page = mmap_pages[i];
+ SetPageForeign(page, netif_page_release);
+ netif_page_index(page) = i;
+ INIT_LIST_HEAD(&pending_inuse[i].list);
+ }
+
+ pending_cons = 0;
+ pending_prod = MAX_PENDING_REQS;
+ for (i = 0; i < MAX_PENDING_REQS; i++)
+ pending_ring[i] = i;
+
+ spin_lock_init(&net_schedule_list_lock);
+ INIT_LIST_HEAD(&net_schedule_list);
+
+ netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
+ if (MODPARM_copy_skb) {
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
+ NULL, 0))
+ netbk_copy_skb_mode = NETBK_ALWAYS_COPY_SKB;
+ else
+ netbk_copy_skb_mode = NETBK_DELAYED_COPY_SKB;
+ }
+
+ netif_accel_init();
+
+ netif_xenbus_init();
+
+#ifdef NETBE_DEBUG_INTERRUPT
+ (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
+ 0,
+ netif_be_dbg,
+ IRQF_SHARED,
+ "net-be-dbg",
+ &netif_be_dbg);
+#endif
+
+ return 0;
+}
+
+module_init(netback_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null
+/* Xenbus code for netif backend
+ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+ Copyright (C) 2005 XenSource Ltd
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <xen/xenbus.h>
+#include "common.h"
+
+#if 0
+#undef DPRINTK
+#define DPRINTK(fmt, args...) \
+ printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
+#endif
+
+
+static int connect_rings(struct backend_info *);
+static void connect(struct backend_info *);
+static void backend_create_netif(struct backend_info *be);
+
+static int netback_remove(struct xenbus_device *dev)
+{
+ struct backend_info *be = dev->dev.driver_data;
+
+ netback_remove_accelerators(be, dev);
+
+ if (be->netif) {
+ netif_disconnect(be->netif);
+ be->netif = NULL;
+ }
+ kfree(be);
+ dev->dev.driver_data = NULL;
+ return 0;
+}
+
+
+/**
+ * Entry point to this code when a new device is created. Allocate the basic
+ * structures and switch to InitWait.
+ */
+static int netback_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ const char *message;
+ struct xenbus_transaction xbt;
+ int err;
+ int sg;
+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
+ GFP_KERNEL);
+ if (!be) {
+ xenbus_dev_fatal(dev, -ENOMEM,
+ "allocating backend structure");
+ return -ENOMEM;
+ }
+
+ be->dev = dev;
+ dev->dev.driver_data = be;
+
+ sg = 1;
+ if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB)
+ sg = 0;
+
+ do {
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "starting transaction");
+ goto fail;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
+ if (err) {
+ message = "writing feature-sg";
+ goto abort_transaction;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
+ "%d", sg);
+ if (err) {
+ message = "writing feature-gso-tcpv4";
+ goto abort_transaction;
+ }
+
+ /* We support rx-copy path. */
+ err = xenbus_printf(xbt, dev->nodename,
+ "feature-rx-copy", "%d", 1);
+ if (err) {
+ message = "writing feature-rx-copy";
+ goto abort_transaction;
+ }
+
+ /*
+ * We don't support rx-flip path (except old guests who don't
+ * grok this feature flag).
+ */
+ err = xenbus_printf(xbt, dev->nodename,
+ "feature-rx-flip", "%d", 0);
+ if (err) {
+ message = "writing feature-rx-flip";
+ goto abort_transaction;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
+ } while (err == -EAGAIN);
+
+ if (err) {
+ xenbus_dev_fatal(dev, err, "completing transaction");
+ goto fail;
+ }
+
+ netback_probe_accelerators(be, dev);
+
+ err = xenbus_switch_state(dev, XenbusStateInitWait);
+ if (err)
+ goto fail;
+
+ /* This kicks hotplug scripts, so do it immediately. */
+ backend_create_netif(be);
+
+ return 0;
+
+abort_transaction:
+ xenbus_transaction_end(xbt, 1);
+ xenbus_dev_fatal(dev, err, "%s", message);
+fail:
+ DPRINTK("failed");
+ netback_remove(dev);
+ return err;
+}
+
+
+/**
+ * Handle the creation of the hotplug script environment. We add the script
+ * and vif variables to the environment, for the benefit of the vif-* hotplug
+ * scripts.
+ */
+static int netback_uevent(struct xenbus_device *xdev, struct kobj_uevent_env *env)
+{
+ struct backend_info *be = xdev->dev.driver_data;
+ netif_t *netif = be->netif;
+ char *val;
+
+ DPRINTK("netback_uevent");
+
+ val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
+ if (IS_ERR(val)) {
+ int err = PTR_ERR(val);
+ xenbus_dev_fatal(xdev, err, "reading script");
+ return err;
+ }
+ else {
+ add_uevent_var(env, "script=%s", val);
+ kfree(val);
+ }
+
+ add_uevent_var(env, "vif=%s", netif->dev->name);
+
+ return 0;
+}
+
+
+static void backend_create_netif(struct backend_info *be)
+{
+ int err;
+ long handle;
+ struct xenbus_device *dev = be->dev;
+
+ if (be->netif != NULL)
+ return;
+
+ err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
+ if (err != 1) {
+ xenbus_dev_fatal(dev, err, "reading handle");
+ return;
+ }
+
+ be->netif = netif_alloc(dev->otherend_id, handle);
+ if (IS_ERR(be->netif)) {
+ err = PTR_ERR(be->netif);
+ be->netif = NULL;
+ xenbus_dev_fatal(dev, err, "creating interface");
+ return;
+ }
+
+ kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
+}
+
+
+/**
+ * Callback received when the frontend's state changes.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+ enum xenbus_state frontend_state)
+{
+ struct backend_info *be = dev->dev.driver_data;
+
+ DPRINTK("%s", xenbus_strstate(frontend_state));
+
+ be->frontend_state = frontend_state;
+
+ switch (frontend_state) {
+ case XenbusStateInitialising:
+ if (dev->state == XenbusStateClosed) {
+ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
+ __FUNCTION__, dev->nodename);
+ xenbus_switch_state(dev, XenbusStateInitWait);
+ }
+ break;
+
+ case XenbusStateInitialised:
+ break;
+
+ case XenbusStateConnected:
+ backend_create_netif(be);
+ if (be->netif)
+ connect(be);
+ break;
+
+ case XenbusStateClosing:
+ if (be->netif) {
+ netif_disconnect(be->netif);
+ be->netif = NULL;
+ }
+ xenbus_switch_state(dev, XenbusStateClosing);
+ break;
+
+ case XenbusStateClosed:
+ xenbus_switch_state(dev, XenbusStateClosed);
+ if (xenbus_dev_is_online(dev))
+ break;
+ /* fall through if not online */
+ case XenbusStateUnknown:
+ if (be->netif != NULL)
+ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
+ device_unregister(&dev->dev);
+ break;
+
+ default:
+ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+ frontend_state);
+ break;
+ }
+}
+
+
+static void xen_net_read_rate(struct xenbus_device *dev,
+ unsigned long *bytes, unsigned long *usec)
+{
+ char *s, *e;
+ unsigned long b, u;
+ char *ratestr;
+
+ /* Default to unlimited bandwidth. */
+ *bytes = ~0UL;
+ *usec = 0;
+
+ ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
+ if (IS_ERR(ratestr))
+ return;
+
+ s = ratestr;
+ b = simple_strtoul(s, &e, 10);
+ if ((s == e) || (*e != ','))
+ goto fail;
+
+ s = e + 1;
+ u = simple_strtoul(s, &e, 10);
+ if ((s == e) || (*e != '\0'))
+ goto fail;
+
+ *bytes = b;
+ *usec = u;
+
+ kfree(ratestr);
+ return;
+
+ fail:
+ WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
+ kfree(ratestr);
+}
+
+static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
+{
+ char *s, *e, *macstr;
+ int i;
+
+ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
+ if (IS_ERR(macstr))
+ return PTR_ERR(macstr);
+
+ for (i = 0; i < ETH_ALEN; i++) {
+ mac[i] = simple_strtoul(s, &e, 16);
+ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
+ kfree(macstr);
+ return -ENOENT;
+ }
+ s = e+1;
+ }
+
+ kfree(macstr);
+ return 0;
+}
+
+static void connect(struct backend_info *be)
+{
+ int err;
+ struct xenbus_device *dev = be->dev;
+
+ err = connect_rings(be);
+ if (err)
+ return;
+
+ err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
+ return;
+ }
+
+ xen_net_read_rate(dev, &be->netif->credit_bytes,
+ &be->netif->credit_usec);
+ be->netif->remaining_credit = be->netif->credit_bytes;
+
+ xenbus_switch_state(dev, XenbusStateConnected);
+
+ netif_wake_queue(be->netif->dev);
+}
+
+
+static int connect_rings(struct backend_info *be)
+{
+ struct xenbus_device *dev = be->dev;
+ unsigned long tx_ring_ref, rx_ring_ref;
+ unsigned int evtchn, rx_copy;
+ int err;
+ int val;
+
+ DPRINTK("");
+
+ err = xenbus_gather(XBT_NIL, dev->otherend,
+ "tx-ring-ref", "%lu", &tx_ring_ref,
+ "rx-ring-ref", "%lu", &rx_ring_ref,
+ "event-channel", "%u", &evtchn, NULL);
+ if (err) {
+ xenbus_dev_fatal(dev, err,
+ "reading %s/ring-ref and event-channel",
+ dev->otherend);
+ return err;
+ }
+
+ err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
+ &rx_copy);
+ if (err == -ENOENT) {
+ err = 0;
+ rx_copy = 0;
+ }
+ if (err < 0) {
+ xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
+ dev->otherend);
+ return err;
+ }
+ be->netif->copying_receiver = !!rx_copy;
+
+ if (be->netif->dev->tx_queue_len != 0) {
+ if (xenbus_scanf(XBT_NIL, dev->otherend,
+ "feature-rx-notify", "%d", &val) < 0)
+ val = 0;
+ if (val)
+ be->netif->can_queue = 1;
+ else
+ /* Must be non-zero for pfifo_fast to work. */
+ be->netif->dev->tx_queue_len = 1;
+ }
+
+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
+ val = 0;
+ if (val) {
+ be->netif->features |= NETIF_F_SG;
+ be->netif->dev->features |= NETIF_F_SG;
+ }
+
+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
+ &val) < 0)
+ val = 0;
+ if (val) {
+ be->netif->features |= NETIF_F_TSO;
+ be->netif->dev->features |= NETIF_F_TSO;
+ }
+
+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
+ "%d", &val) < 0)
+ val = 0;
+ if (val) {
+ be->netif->features &= ~NETIF_F_IP_CSUM;
+ be->netif->dev->features &= ~NETIF_F_IP_CSUM;
+ }
+
+ /* Map the shared frame, irq etc. */
+ err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
+ if (err) {
+ xenbus_dev_fatal(dev, err,
+ "mapping shared-frames %lu/%lu port %u",
+ tx_ring_ref, rx_ring_ref, evtchn);
+ return err;
+ }
+ return 0;
+}
+
+
+/* ** Driver Registration ** */
+
+
+static const struct xenbus_device_id netback_ids[] = {
+ { "vif" },
+ { "" }
+};
+
+
+static struct xenbus_driver netback = {
+ .name = "vif",
+ .ids = netback_ids,
+ .probe = netback_probe,
+ .remove = netback_remove,
+ .uevent = netback_uevent,
+ .otherend_changed = frontend_changed,
+};
+
+
+void netif_xenbus_init(void)
+{
+ if (xenbus_register_backend(&netback))
+ BUG();
+}
--- /dev/null
+
+obj-$(CONFIG_XEN_NETDEV_FRONTEND) := xennet.o
+
+xennet-objs := netfront.o accel.o
--- /dev/null
+/******************************************************************************
+ * Virtual network driver for conversing with remote driver backends.
+ *
+ * Copyright (C) 2007 Solarflare Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+
+#include "netfront.h"
+
+#define DPRINTK(fmt, args...) \
+ pr_debug("netfront/accel (%s:%d) " fmt, \
+ __FUNCTION__, __LINE__, ##args)
+#define IPRINTK(fmt, args...) \
+ printk(KERN_INFO "netfront/accel: " fmt, ##args)
+#define WPRINTK(fmt, args...) \
+ printk(KERN_WARNING "netfront/accel: " fmt, ##args)
+
+static int netfront_remove_accelerator(struct netfront_info *np,
+ struct xenbus_device *dev);
+static int netfront_load_accelerator(struct netfront_info *np,
+ struct xenbus_device *dev,
+ const char *frontend);
+
+/*
+ * List of all netfront accelerator plugin modules available. Each
+ * list entry is of type struct netfront_accelerator.
+ */
+static struct list_head accelerators_list;
+
+/* Lock to protect access to accelerators_list */
+static spinlock_t accelerators_lock;
+
+/* Workqueue to process acceleration configuration changes */
+struct workqueue_struct *accel_watch_workqueue;
+
+/* Mutex to prevent concurrent loads and suspends, etc. */
+DEFINE_MUTEX(accelerator_mutex);
+
+void netif_init_accel(void)
+{
+ INIT_LIST_HEAD(&accelerators_list);
+ spin_lock_init(&accelerators_lock);
+
+ accel_watch_workqueue = create_workqueue("net_accel");
+}
+
+void netif_exit_accel(void)
+{
+ struct netfront_accelerator *accelerator, *tmp;
+ unsigned long flags;
+
+ flush_workqueue(accel_watch_workqueue);
+ destroy_workqueue(accel_watch_workqueue);
+
+ spin_lock_irqsave(&accelerators_lock, flags);
+
+ list_for_each_entry_safe(accelerator, tmp, &accelerators_list, link) {
+ BUG_ON(!list_empty(&accelerator->vif_states));
+
+ list_del(&accelerator->link);
+ kfree(accelerator->frontend);
+ kfree(accelerator);
+ }
+
+ spin_unlock_irqrestore(&accelerators_lock, flags);
+}
+
+
+/*
+ * Watch the configured accelerator and change plugin if it's modified
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+static void accel_watch_work(struct work_struct *context)
+#else
+static void accel_watch_work(void *context)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+ struct netfront_accel_vif_state *vif_state =
+ container_of(context, struct netfront_accel_vif_state,
+ accel_work);
+#else
+ struct netfront_accel_vif_state *vif_state =
+ (struct netfront_accel_vif_state *)context;
+#endif
+ struct netfront_info *np = vif_state->np;
+ char *accel_frontend;
+ int accel_len, rc = -1;
+
+ mutex_lock(&accelerator_mutex);
+
+ accel_frontend = xenbus_read(XBT_NIL, np->xbdev->otherend,
+ "accel-frontend", &accel_len);
+ if (IS_ERR(accel_frontend)) {
+ accel_frontend = NULL;
+ netfront_remove_accelerator(np, np->xbdev);
+ } else {
+ /* If this is the first time, request the accelerator,
+ otherwise only request one if it has changed */
+ if (vif_state->accel_frontend == NULL) {
+ rc = netfront_load_accelerator(np, np->xbdev,
+ accel_frontend);
+ } else {
+ if (strncmp(vif_state->accel_frontend, accel_frontend,
+ accel_len)) {
+ netfront_remove_accelerator(np, np->xbdev);
+ rc = netfront_load_accelerator(np, np->xbdev,
+ accel_frontend);
+ }
+ }
+ }
+
+ /* Get rid of previous state and replace with the new name */
+ if (vif_state->accel_frontend != NULL)
+ kfree(vif_state->accel_frontend);
+ vif_state->accel_frontend = accel_frontend;
+
+ mutex_unlock(&accelerator_mutex);
+
+ if (rc == 0) {
+ DPRINTK("requesting module %s\n", accel_frontend);
+ request_module("%s", accel_frontend);
+ /*
+ * Module should now call netfront_accelerator_loaded() once
+ * it's up and running, and we can continue from there
+ */
+ }
+}
+
+
+static void accel_watch_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ struct netfront_accel_vif_state *vif_state =
+ container_of(watch, struct netfront_accel_vif_state,
+ accel_watch);
+ queue_work(accel_watch_workqueue, &vif_state->accel_work);
+}
+
+
+void netfront_accelerator_add_watch(struct netfront_info *np)
+{
+ int err;
+
+ /* Check we're not trying to overwrite an existing watch */
+ BUG_ON(np->accel_vif_state.accel_watch.node != NULL);
+
+ /* Get a watch on the accelerator plugin */
+ err = xenbus_watch_path2(np->xbdev, np->xbdev->otherend,
+ "accel-frontend",
+ &np->accel_vif_state.accel_watch,
+ accel_watch_changed);
+ if (err) {
+ DPRINTK("%s: Failed to register accel watch: %d\n",
+ __FUNCTION__, err);
+ np->accel_vif_state.accel_watch.node = NULL;
+ }
+}
+
+
+static
+void netfront_accelerator_remove_watch(struct netfront_info *np)
+{
+ struct netfront_accel_vif_state *vif_state = &np->accel_vif_state;
+
+ /* Get rid of watch on accelerator plugin */
+ if (vif_state->accel_watch.node != NULL) {
+ unregister_xenbus_watch(&vif_state->accel_watch);
+ kfree(vif_state->accel_watch.node);
+ vif_state->accel_watch.node = NULL;
+
+ flush_workqueue(accel_watch_workqueue);
+
+ /* Clean up any state left from watch */
+ if (vif_state->accel_frontend != NULL) {
+ kfree(vif_state->accel_frontend);
+ vif_state->accel_frontend = NULL;
+ }
+ }
+}
+
+
+/*
+ * Initialise the accel_vif_state field in the netfront state
+ */
+void init_accelerator_vif(struct netfront_info *np,
+ struct xenbus_device *dev)
+{
+ np->accelerator = NULL;
+
+ /* It's assumed that these things don't change */
+ np->accel_vif_state.np = np;
+ np->accel_vif_state.dev = dev;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+ INIT_WORK(&np->accel_vif_state.accel_work, accel_watch_work);
+#else
+ INIT_WORK(&np->accel_vif_state.accel_work, accel_watch_work,
+ &np->accel_vif_state);
+#endif
+}
+
+
+/*
+ * Compare a frontend description string against an accelerator to see
+ * if they match. Would ultimately be nice to replace the string with
+ * a unique numeric identifier for each accelerator.
+ */
+static int match_accelerator(const char *frontend,
+ struct netfront_accelerator *accelerator)
+{
+ return strcmp(frontend, accelerator->frontend) == 0;
+}
+
+
+/*
+ * Add a frontend vif to the list of vifs that is using a netfront
+ * accelerator plugin module.
+ */
+static void add_accelerator_vif(struct netfront_accelerator *accelerator,
+ struct netfront_info *np)
+{
+ unsigned long flags;
+
+ /* Need lock to write list */
+ spin_lock_irqsave(&accelerator->vif_states_lock, flags);
+
+ if (np->accelerator == NULL) {
+ np->accelerator = accelerator;
+
+ list_add(&np->accel_vif_state.link, &accelerator->vif_states);
+ } else {
+ /*
+ * May get here legitimately if suspend_cancel is
+ * called, but in that case configuration should not
+ * have changed
+ */
+ BUG_ON(np->accelerator != accelerator);
+ }
+
+ spin_unlock_irqrestore(&accelerator->vif_states_lock, flags);
+}
+
+
+/*
+ * Initialise the state to track an accelerator plugin module.
+ */
+static int init_accelerator(const char *frontend,
+ struct netfront_accelerator **result,
+ struct netfront_accel_hooks *hooks)
+{
+ struct netfront_accelerator *accelerator =
+ kmalloc(sizeof(struct netfront_accelerator), GFP_KERNEL);
+ unsigned long flags;
+ int frontend_len;
+
+ if (!accelerator) {
+ DPRINTK("no memory for accelerator\n");
+ return -ENOMEM;
+ }
+
+ frontend_len = strlen(frontend) + 1;
+ accelerator->frontend = kmalloc(frontend_len, GFP_KERNEL);
+ if (!accelerator->frontend) {
+ DPRINTK("no memory for accelerator\n");
+ kfree(accelerator);
+ return -ENOMEM;
+ }
+ strlcpy(accelerator->frontend, frontend, frontend_len);
+
+ INIT_LIST_HEAD(&accelerator->vif_states);
+ spin_lock_init(&accelerator->vif_states_lock);
+
+ accelerator->hooks = hooks;
+
+ spin_lock_irqsave(&accelerators_lock, flags);
+ list_add(&accelerator->link, &accelerators_list);
+ spin_unlock_irqrestore(&accelerators_lock, flags);
+
+ *result = accelerator;
+
+ return 0;
+}
+
+
+/*
+ * Modify the hooks stored in the per-vif state to match that in the
+ * netfront accelerator's state.
+ */
+static void
+accelerator_set_vif_state_hooks(struct netfront_accel_vif_state *vif_state)
+{
+ /* This function must be called with the vif_states_lock held */
+
+ DPRINTK("%p\n",vif_state);
+
+ /* Make sure there are no data path operations going on */
+ napi_disable(&vif_state->np->napi);
+ netif_tx_lock_bh(vif_state->np->netdev);
+
+ vif_state->hooks = vif_state->np->accelerator->hooks;
+
+ netif_tx_unlock_bh(vif_state->np->netdev);
+ napi_enable(&vif_state->np->napi);
+}
+
+
+static void accelerator_probe_new_vif(struct netfront_info *np,
+ struct xenbus_device *dev,
+ struct netfront_accelerator *accelerator)
+{
+ struct netfront_accel_hooks *hooks;
+ unsigned long flags;
+
+ DPRINTK("\n");
+
+ /* Include this frontend device on the accelerator's list */
+ add_accelerator_vif(accelerator, np);
+
+ hooks = accelerator->hooks;
+
+ if (hooks) {
+ if (hooks->new_device(np->netdev, dev) == 0) {
+ spin_lock_irqsave
+ (&accelerator->vif_states_lock, flags);
+
+ accelerator_set_vif_state_hooks(&np->accel_vif_state);
+
+ spin_unlock_irqrestore
+ (&accelerator->vif_states_lock, flags);
+ }
+ }
+
+ return;
+}
+
+
+/*
+ * Request that a particular netfront accelerator plugin is loaded.
+ * Usually called as a result of the vif configuration specifying
+ * which one to use. Must be called with accelerator_mutex held
+ */
+static int netfront_load_accelerator(struct netfront_info *np,
+ struct xenbus_device *dev,
+ const char *frontend)
+{
+ struct netfront_accelerator *accelerator;
+ int rc = 0;
+
+ DPRINTK(" %s\n", frontend);
+
+ /*
+ * Look at list of loaded accelerators to see if the requested
+ * one is already there
+ */
+ list_for_each_entry(accelerator, &accelerators_list, link) {
+ if (match_accelerator(frontend, accelerator)) {
+ accelerator_probe_new_vif(np, dev, accelerator);
+ return 0;
+ }
+ }
+
+ /* Couldn't find it, so create a new one and load the module */
+ if ((rc = init_accelerator(frontend, &accelerator, NULL)) < 0) {
+ return rc;
+ }
+
+ /* Include this frontend device on the accelerator's list */
+ add_accelerator_vif(accelerator, np);
+
+ return rc;
+}
+
+
+/*
+ * Go through all the netfront vifs and see if they have requested
+ * this accelerator. Notify the accelerator plugin of the relevant
+ * device if so. Called when an accelerator plugin module is first
+ * loaded and connects to netfront.
+ */
+static void
+accelerator_probe_vifs(struct netfront_accelerator *accelerator,
+ struct netfront_accel_hooks *hooks)
+{
+ struct netfront_accel_vif_state *vif_state, *tmp;
+ unsigned long flags;
+
+ DPRINTK("%p\n", accelerator);
+
+ /*
+ * Store the hooks for future calls to probe a new device, and
+ * to wire into the vif_state once the accelerator plugin is
+ * ready to accelerate each vif
+ */
+ BUG_ON(hooks == NULL);
+ accelerator->hooks = hooks;
+
+ /*
+ * currently hold accelerator_mutex, so don't need
+ * vif_states_lock to read the list
+ */
+ list_for_each_entry_safe(vif_state, tmp, &accelerator->vif_states,
+ link) {
+ struct netfront_info *np = vif_state->np;
+
+ if (hooks->new_device(np->netdev, vif_state->dev) == 0) {
+ spin_lock_irqsave
+ (&accelerator->vif_states_lock, flags);
+
+ accelerator_set_vif_state_hooks(vif_state);
+
+ spin_unlock_irqrestore
+ (&accelerator->vif_states_lock, flags);
+ }
+ }
+}
+
+
+/*
+ * Called by the netfront accelerator plugin module when it has loaded
+ */
+int netfront_accelerator_loaded(int version, const char *frontend,
+ struct netfront_accel_hooks *hooks)
+{
+ struct netfront_accelerator *accelerator;
+
+ if (is_initial_xendomain())
+ return -EINVAL;
+
+ if (version != NETFRONT_ACCEL_VERSION) {
+ if (version > NETFRONT_ACCEL_VERSION) {
+ /* Caller has higher version number, leave it
+ up to them to decide whether to continue.
+ They can re-call with a lower number if
+ they're happy to be compatible with us */
+ return NETFRONT_ACCEL_VERSION;
+ } else {
+ /* We have a more recent version than caller.
+ Currently reject, but may in future be able
+ to be backwardly compatible */
+ return -EPROTO;
+ }
+ }
+
+ mutex_lock(&accelerator_mutex);
+
+ /*
+ * Look through list of accelerators to see if it has already
+ * been requested
+ */
+ list_for_each_entry(accelerator, &accelerators_list, link) {
+ if (match_accelerator(frontend, accelerator)) {
+ accelerator_probe_vifs(accelerator, hooks);
+ goto out;
+ }
+ }
+
+ /*
+ * If it wasn't in the list, add it now so that when it is
+ * requested the caller will find it
+ */
+ DPRINTK("Couldn't find matching accelerator (%s)\n",
+ frontend);
+
+ init_accelerator(frontend, &accelerator, hooks);
+
+ out:
+ mutex_unlock(&accelerator_mutex);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netfront_accelerator_loaded);
+
+
+/*
+ * Remove the hooks from a single vif state.
+ */
+static void
+accelerator_remove_single_hook(struct netfront_accelerator *accelerator,
+ struct netfront_accel_vif_state *vif_state)
+{
+ /* Make sure there are no data path operations going on */
+ napi_disable(&vif_state->np->napi);
+ netif_tx_lock_bh(vif_state->np->netdev);
+
+ /*
+ * Remove the hooks, but leave the vif_state on the
+ * accelerator's list as that signifies this vif is
+ * interested in using that accelerator if it becomes
+ * available again
+ */
+ vif_state->hooks = NULL;
+
+ netif_tx_unlock_bh(vif_state->np->netdev);
+ napi_enable(&vif_state->np->napi);
+}
+
+
+/*
+ * Safely remove the accelerator function hooks from a netfront state.
+ */
+static void accelerator_remove_hooks(struct netfront_accelerator *accelerator)
+{
+ struct netfront_accel_hooks *hooks;
+ struct netfront_accel_vif_state *vif_state, *tmp;
+ unsigned long flags;
+
+ /* Mutex is held so don't need vif_states_lock to iterate list */
+ list_for_each_entry_safe(vif_state, tmp,
+ &accelerator->vif_states,
+ link) {
+ spin_lock_irqsave(&accelerator->vif_states_lock, flags);
+
+ if(vif_state->hooks) {
+ hooks = vif_state->hooks;
+
+ /* Last chance to get statistics from the accelerator */
+ hooks->get_stats(vif_state->np->netdev,
+ &vif_state->np->stats);
+
+ spin_unlock_irqrestore(&accelerator->vif_states_lock,
+ flags);
+
+ accelerator_remove_single_hook(accelerator, vif_state);
+
+ accelerator->hooks->remove(vif_state->dev);
+ } else {
+ spin_unlock_irqrestore(&accelerator->vif_states_lock,
+ flags);
+ }
+ }
+
+ accelerator->hooks = NULL;
+}
+
+
+/*
+ * Called by a netfront accelerator when it is unloaded. This safely
+ * removes the hooks into the plugin and blocks until all devices have
+ * finished using it, so on return it is safe to unload.
+ */
+void netfront_accelerator_stop(const char *frontend)
+{
+ struct netfront_accelerator *accelerator;
+ unsigned long flags;
+
+ mutex_lock(&accelerator_mutex);
+ spin_lock_irqsave(&accelerators_lock, flags);
+
+ list_for_each_entry(accelerator, &accelerators_list, link) {
+ if (match_accelerator(frontend, accelerator)) {
+ spin_unlock_irqrestore(&accelerators_lock, flags);
+
+ accelerator_remove_hooks(accelerator);
+
+ goto out;
+ }
+ }
+ spin_unlock_irqrestore(&accelerators_lock, flags);
+ out:
+ mutex_unlock(&accelerator_mutex);
+}
+EXPORT_SYMBOL_GPL(netfront_accelerator_stop);
+
+
+/* Helper for call_remove and do_suspend */
+static int do_remove(struct netfront_info *np, struct xenbus_device *dev,
+ unsigned long *lock_flags)
+{
+ struct netfront_accelerator *accelerator = np->accelerator;
+ struct netfront_accel_hooks *hooks;
+ int rc = 0;
+
+ if (np->accel_vif_state.hooks) {
+ hooks = np->accel_vif_state.hooks;
+
+ /* Last chance to get statistics from the accelerator */
+ hooks->get_stats(np->netdev, &np->stats);
+
+ spin_unlock_irqrestore(&accelerator->vif_states_lock,
+ *lock_flags);
+
+ /*
+ * Try and do the opposite of accelerator_probe_new_vif
+ * to ensure there's no state pointing back at the
+ * netdev
+ */
+ accelerator_remove_single_hook(accelerator,
+ &np->accel_vif_state);
+
+ rc = accelerator->hooks->remove(dev);
+
+ spin_lock_irqsave(&accelerator->vif_states_lock, *lock_flags);
+ }
+
+ return rc;
+}
+
+
+static int netfront_remove_accelerator(struct netfront_info *np,
+ struct xenbus_device *dev)
+{
+ struct netfront_accelerator *accelerator;
+ struct netfront_accel_vif_state *tmp_vif_state;
+ unsigned long flags;
+ int rc = 0;
+
+ /* Check that we've got a device that was accelerated */
+ if (np->accelerator == NULL)
+ return rc;
+
+ accelerator = np->accelerator;
+
+ spin_lock_irqsave(&accelerator->vif_states_lock, flags);
+
+ list_for_each_entry(tmp_vif_state, &accelerator->vif_states,
+ link) {
+ if (tmp_vif_state == &np->accel_vif_state) {
+ list_del(&np->accel_vif_state.link);
+ break;
+ }
+ }
+
+ rc = do_remove(np, dev, &flags);
+
+ np->accelerator = NULL;
+
+ spin_unlock_irqrestore(&accelerator->vif_states_lock, flags);
+
+ return rc;
+}
+
+
+int netfront_accelerator_call_remove(struct netfront_info *np,
+ struct xenbus_device *dev)
+{
+ int rc;
+ netfront_accelerator_remove_watch(np);
+ mutex_lock(&accelerator_mutex);
+ rc = netfront_remove_accelerator(np, dev);
+ mutex_unlock(&accelerator_mutex);
+ return rc;
+}
+
+
+int netfront_accelerator_suspend(struct netfront_info *np,
+ struct xenbus_device *dev)
+{
+ unsigned long flags;
+ int rc = 0;
+
+ netfront_accelerator_remove_watch(np);
+
+ mutex_lock(&accelerator_mutex);
+
+ /* Check that we've got a device that was accelerated */
+ if (np->accelerator == NULL)
+ goto out;
+
+ /*
+ * Call the remove accelerator hook, but leave the vif_state
+ * on the accelerator's list in case there is a suspend_cancel.
+ */
+ spin_lock_irqsave(&np->accelerator->vif_states_lock, flags);
+
+ rc = do_remove(np, dev, &flags);
+
+ spin_unlock_irqrestore(&np->accelerator->vif_states_lock, flags);
+ out:
+ mutex_unlock(&accelerator_mutex);
+ return rc;
+}
+
+
+int netfront_accelerator_suspend_cancel(struct netfront_info *np,
+ struct xenbus_device *dev)
+{
+ /*
+ * Setting the watch will cause it to fire and probe the
+ * accelerator, so no need to call accelerator_probe_new_vif()
+ * directly here
+ */
+ netfront_accelerator_add_watch(np);
+ return 0;
+}
+
+
+void netfront_accelerator_resume(struct netfront_info *np,
+ struct xenbus_device *dev)
+{
+ struct netfront_accel_vif_state *accel_vif_state = NULL;
+ spinlock_t *vif_states_lock;
+ unsigned long flags;
+
+ mutex_lock(&accelerator_mutex);
+
+ /* Check that we've got a device that was accelerated */
+ if(np->accelerator == NULL)
+ goto out;
+
+ /* Find the vif_state from the accelerator's list */
+ list_for_each_entry(accel_vif_state, &np->accelerator->vif_states,
+ link) {
+ if (accel_vif_state->dev == dev) {
+ BUG_ON(accel_vif_state != &np->accel_vif_state);
+
+ vif_states_lock = &np->accelerator->vif_states_lock;
+ spin_lock_irqsave(vif_states_lock, flags);
+
+ /*
+ * Remove it from the accelerator's list so
+ * state is consistent for probing new vifs
+ * when they get connected
+ */
+ list_del(&accel_vif_state->link);
+ np->accelerator = NULL;
+
+ spin_unlock_irqrestore(vif_states_lock, flags);
+
+ break;
+ }
+ }
+
+ out:
+ mutex_unlock(&accelerator_mutex);
+ return;
+}
+
+
+int netfront_check_accelerator_queue_ready(struct net_device *dev,
+ struct netfront_info *np)
+{
+ struct netfront_accelerator *accelerator;
+ struct netfront_accel_hooks *hooks;
+ int rc = 1;
+ unsigned long flags;
+
+ accelerator = np->accelerator;
+
+ /* Call the check_ready accelerator hook. */
+ if (np->accel_vif_state.hooks && accelerator) {
+ spin_lock_irqsave(&accelerator->vif_states_lock, flags);
+ hooks = np->accel_vif_state.hooks;
+ if (hooks && np->accelerator == accelerator)
+ rc = np->accel_vif_state.hooks->check_ready(dev);
+ spin_unlock_irqrestore(&accelerator->vif_states_lock, flags);
+ }
+
+ return rc;
+}
+
+
+void netfront_accelerator_call_stop_napi_irq(struct netfront_info *np,
+ struct net_device *dev)
+{
+ struct netfront_accelerator *accelerator;
+ struct netfront_accel_hooks *hooks;
+ unsigned long flags;
+
+ accelerator = np->accelerator;
+
+ /* Call the stop_napi_interrupts accelerator hook. */
+ if (np->accel_vif_state.hooks && accelerator != NULL) {
+ spin_lock_irqsave(&accelerator->vif_states_lock, flags);
+ hooks = np->accel_vif_state.hooks;
+ if (hooks && np->accelerator == accelerator)
+ np->accel_vif_state.hooks->stop_napi_irq(dev);
+ spin_unlock_irqrestore(&accelerator->vif_states_lock, flags);
+ }
+}
+
+
+int netfront_accelerator_call_get_stats(struct netfront_info *np,
+ struct net_device *dev)
+{
+ struct netfront_accelerator *accelerator;
+ struct netfront_accel_hooks *hooks;
+ unsigned long flags;
+ int rc = 0;
+
+ accelerator = np->accelerator;
+
+ /* Call the get_stats accelerator hook. */
+ if (np->accel_vif_state.hooks && accelerator != NULL) {
+ spin_lock_irqsave(&accelerator->vif_states_lock, flags);
+ hooks = np->accel_vif_state.hooks;
+ if (hooks && np->accelerator == accelerator)
+ rc = np->accel_vif_state.hooks->get_stats(dev,
+ &np->stats);
+ spin_unlock_irqrestore(&accelerator->vif_states_lock, flags);
+ }
+ return rc;
+}
+
--- /dev/null
+/******************************************************************************
+ * Virtual network driver for conversing with remote driver backends.
+ *
+ * Copyright (c) 2002-2005, K A Fraser
+ * Copyright (c) 2005, XenSource Ltd
+ * Copyright (C) 2007 Solarflare Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/ethtool.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/io.h>
+#include <linux/moduleparam.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <asm/uaccess.h>
+#include <xen/evtchn.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/netif.h>
+#include <xen/interface/memory.h>
+#include <xen/balloon.h>
+#include <asm/page.h>
+#include <asm/maddr.h>
+#include <asm/uaccess.h>
+#include <xen/interface/grant_table.h>
+#include <xen/gnttab.h>
+
+struct netfront_cb {
+ struct page *page;
+ unsigned offset;
+};
+
+#define NETFRONT_SKB_CB(skb) ((struct netfront_cb *)((skb)->cb))
+
+#include "netfront.h"
+
+/*
+ * Mutually-exclusive module options to select receive data path:
+ * rx_copy : Packets are copied by network backend into local memory
+ * rx_flip : Page containing packet data is transferred to our ownership
+ * For fully-virtualised guests there is no option - copying must be used.
+ * For paravirtualised guests, flipping is the default.
+ */
+#ifdef CONFIG_XEN
+static int MODPARM_rx_copy = 0;
+module_param_named(rx_copy, MODPARM_rx_copy, bool, 0);
+MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)");
+static int MODPARM_rx_flip = 0;
+module_param_named(rx_flip, MODPARM_rx_flip, bool, 0);
+MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)");
+#else
+static const int MODPARM_rx_copy = 1;
+static const int MODPARM_rx_flip = 0;
+#endif
+
+#define RX_COPY_THRESHOLD 256
+
+/* If we don't have GSO, fake things up so that we never try to use it. */
+#if defined(NETIF_F_GSO)
+#define HAVE_GSO 1
+#define HAVE_TSO 1 /* TSO is a subset of GSO */
+#define HAVE_CSUM_OFFLOAD 1
+static inline void dev_disable_gso_features(struct net_device *dev)
+{
+ /* Turn off all GSO bits except ROBUST. */
+ dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
+ dev->features |= NETIF_F_GSO_ROBUST;
+}
+#elif defined(NETIF_F_TSO)
+#define HAVE_GSO 0
+#define HAVE_TSO 1
+
+/* Some older kernels cannot cope with incorrect checksums,
+ * particularly in netfilter. I'm not sure there is 100% correlation
+ * with the presence of NETIF_F_TSO but it appears to be a good first
+ * approximiation.
+ */
+#define HAVE_CSUM_OFFLOAD 0
+
+#define gso_size tso_size
+#define gso_segs tso_segs
+static inline void dev_disable_gso_features(struct net_device *dev)
+{
+ /* Turn off all TSO bits. */
+ dev->features &= ~NETIF_F_TSO;
+}
+static inline int skb_is_gso(const struct sk_buff *skb)
+{
+ return skb_shinfo(skb)->tso_size;
+}
+static inline int skb_gso_ok(struct sk_buff *skb, int features)
+{
+ return (features & NETIF_F_TSO);
+}
+
+static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
+{
+ return skb_is_gso(skb) &&
+ (!skb_gso_ok(skb, dev->features) ||
+ unlikely(skb->ip_summed != CHECKSUM_PARTIAL));
+}
+#else
+#define HAVE_GSO 0
+#define HAVE_TSO 0
+#define HAVE_CSUM_OFFLOAD 0
+#define netif_needs_gso(dev, skb) 0
+#define dev_disable_gso_features(dev) ((void)0)
+#define ethtool_op_set_tso(dev, data) (-ENOSYS)
+#endif
+
+#define GRANT_INVALID_REF 0
+
+struct netfront_rx_info {
+ struct netif_rx_response rx;
+ struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
+};
+
+/*
+ * Implement our own carrier flag: the network stack's version causes delays
+ * when the carrier is re-enabled (in particular, dev_activate() may not
+ * immediately be called, which can cause packet loss).
+ */
+#define netfront_carrier_on(netif) ((netif)->carrier = 1)
+#define netfront_carrier_off(netif) ((netif)->carrier = 0)
+#define netfront_carrier_ok(netif) ((netif)->carrier)
+
+/*
+ * Access macros for acquiring freeing slots in tx_skbs[].
+ */
+
+static inline void add_id_to_freelist(struct sk_buff **list, unsigned short id)
+{
+ list[id] = list[0];
+ list[0] = (void *)(unsigned long)id;
+}
+
+static inline unsigned short get_id_from_freelist(struct sk_buff **list)
+{
+ unsigned int id = (unsigned int)(unsigned long)list[0];
+ list[0] = list[id];
+ return id;
+}
+
+static inline int xennet_rxidx(RING_IDX idx)
+{
+ return idx & (NET_RX_RING_SIZE - 1);
+}
+
+static inline struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
+ RING_IDX ri)
+{
+ int i = xennet_rxidx(ri);
+ struct sk_buff *skb = np->rx_skbs[i];
+ np->rx_skbs[i] = NULL;
+ return skb;
+}
+
+static inline grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
+ RING_IDX ri)
+{
+ int i = xennet_rxidx(ri);
+ grant_ref_t ref = np->grant_rx_ref[i];
+ np->grant_rx_ref[i] = GRANT_INVALID_REF;
+ return ref;
+}
+
+#define DPRINTK(fmt, args...) \
+ pr_debug("netfront (%s:%d) " fmt, \
+ __FUNCTION__, __LINE__, ##args)
+#define IPRINTK(fmt, args...) \
+ printk(KERN_INFO "netfront: " fmt, ##args)
+#define WPRINTK(fmt, args...) \
+ printk(KERN_WARNING "netfront: " fmt, ##args)
+
+static int setup_device(struct xenbus_device *, struct netfront_info *);
+static struct net_device *create_netdev(struct xenbus_device *);
+
+static void end_access(int, void *);
+static void netif_disconnect_backend(struct netfront_info *);
+
+static int network_connect(struct net_device *);
+static void network_tx_buf_gc(struct net_device *);
+static void network_alloc_rx_buffers(struct net_device *);
+static void send_fake_arp(struct net_device *);
+
+static irqreturn_t netif_int(int irq, void *dev_id);
+
+#ifdef CONFIG_SYSFS
+static int xennet_sysfs_addif(struct net_device *netdev);
+static void xennet_sysfs_delif(struct net_device *netdev);
+#else /* !CONFIG_SYSFS */
+#define xennet_sysfs_addif(dev) (0)
+#define xennet_sysfs_delif(dev) do { } while(0)
+#endif
+
+static inline int xennet_can_sg(struct net_device *dev)
+{
+ return dev->features & NETIF_F_SG;
+}
+
+/**
+ * Entry point to this code when a new device is created. Allocate the basic
+ * structures and the ring buffers for communication with the backend, and
+ * inform the backend of the appropriate details for those.
+ */
+static int __devinit netfront_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err;
+ struct net_device *netdev;
+ struct netfront_info *info;
+
+ netdev = create_netdev(dev);
+ if (IS_ERR(netdev)) {
+ err = PTR_ERR(netdev);
+ xenbus_dev_fatal(dev, err, "creating netdev");
+ return err;
+ }
+
+ info = netdev_priv(netdev);
+ dev->dev.driver_data = info;
+
+ err = register_netdev(info->netdev);
+ if (err) {
+ printk(KERN_WARNING "%s: register_netdev err=%d\n",
+ __FUNCTION__, err);
+ goto fail;
+ }
+
+ err = xennet_sysfs_addif(info->netdev);
+ if (err) {
+ unregister_netdev(info->netdev);
+ printk(KERN_WARNING "%s: add sysfs failed err=%d\n",
+ __FUNCTION__, err);
+ goto fail;
+ }
+
+ return 0;
+
+ fail:
+ free_netdev(netdev);
+ dev->dev.driver_data = NULL;
+ return err;
+}
+
+static int __devexit netfront_remove(struct xenbus_device *dev)
+{
+ struct netfront_info *info = dev->dev.driver_data;
+
+ DPRINTK("%s\n", dev->nodename);
+
+ netfront_accelerator_call_remove(info, dev);
+
+ netif_disconnect_backend(info);
+
+ del_timer_sync(&info->rx_refill_timer);
+
+ xennet_sysfs_delif(info->netdev);
+
+ unregister_netdev(info->netdev);
+
+ free_netdev(info->netdev);
+
+ return 0;
+}
+
+
+static int netfront_suspend(struct xenbus_device *dev)
+{
+ struct netfront_info *info = dev->dev.driver_data;
+ return netfront_accelerator_suspend(info, dev);
+}
+
+
+static int netfront_suspend_cancel(struct xenbus_device *dev)
+{
+ struct netfront_info *info = dev->dev.driver_data;
+ return netfront_accelerator_suspend_cancel(info, dev);
+}
+
+
+/**
+ * We are reconnecting to the backend, due to a suspend/resume, or a backend
+ * driver restart. We tear down our netif structure and recreate it, but
+ * leave the device-layer structures intact so that this is transparent to the
+ * rest of the kernel.
+ */
+static int netfront_resume(struct xenbus_device *dev)
+{
+ struct netfront_info *info = dev->dev.driver_data;
+
+ DPRINTK("%s\n", dev->nodename);
+
+ netfront_accelerator_resume(info, dev);
+
+ netif_disconnect_backend(info);
+ return 0;
+}
+
+static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
+{
+ char *s, *e, *macstr;
+ int i;
+
+ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
+ if (IS_ERR(macstr))
+ return PTR_ERR(macstr);
+
+ for (i = 0; i < ETH_ALEN; i++) {
+ mac[i] = simple_strtoul(s, &e, 16);
+ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
+ kfree(macstr);
+ return -ENOENT;
+ }
+ s = e+1;
+ }
+
+ kfree(macstr);
+ return 0;
+}
+
+/* Common code used when first setting up, and when resuming. */
+static int talk_to_backend(struct xenbus_device *dev,
+ struct netfront_info *info)
+{
+ const char *message;
+ struct xenbus_transaction xbt;
+ int err;
+
+ /* Read mac only in the first setup. */
+ if (!is_valid_ether_addr(info->mac)) {
+ err = xen_net_read_mac(dev, info->mac);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "parsing %s/mac",
+ dev->nodename);
+ goto out;
+ }
+ }
+
+ /* Create shared ring, alloc event channel. */
+ err = setup_device(dev, info);
+ if (err)
+ goto out;
+
+ /* This will load an accelerator if one is configured when the
+ * watch fires */
+ netfront_accelerator_add_watch(info);
+
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "starting transaction");
+ goto destroy_ring;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref","%u",
+ info->tx_ring_ref);
+ if (err) {
+ message = "writing tx ring-ref";
+ goto abort_transaction;
+ }
+ err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref","%u",
+ info->rx_ring_ref);
+ if (err) {
+ message = "writing rx ring-ref";
+ goto abort_transaction;
+ }
+ err = xenbus_printf(xbt, dev->nodename,
+ "event-channel", "%u",
+ irq_to_evtchn_port(info->irq));
+ if (err) {
+ message = "writing event-channel";
+ goto abort_transaction;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
+ info->copying_receiver);
+ if (err) {
+ message = "writing request-rx-copy";
+ goto abort_transaction;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
+ if (err) {
+ message = "writing feature-rx-notify";
+ goto abort_transaction;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-no-csum-offload",
+ "%d", !HAVE_CSUM_OFFLOAD);
+ if (err) {
+ message = "writing feature-no-csum-offload";
+ goto abort_transaction;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
+ if (err) {
+ message = "writing feature-sg";
+ goto abort_transaction;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d",
+ HAVE_TSO);
+ if (err) {
+ message = "writing feature-gso-tcpv4";
+ goto abort_transaction;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err) {
+ if (err == -EAGAIN)
+ goto again;
+ xenbus_dev_fatal(dev, err, "completing transaction");
+ goto destroy_ring;
+ }
+
+ return 0;
+
+ abort_transaction:
+ xenbus_transaction_end(xbt, 1);
+ xenbus_dev_fatal(dev, err, "%s", message);
+ destroy_ring:
+ netfront_accelerator_call_remove(info, dev);
+ netif_disconnect_backend(info);
+ out:
+ return err;
+}
+
+static int setup_device(struct xenbus_device *dev, struct netfront_info *info)
+{
+ struct netif_tx_sring *txs;
+ struct netif_rx_sring *rxs;
+ int err;
+ struct net_device *netdev = info->netdev;
+
+ info->tx_ring_ref = GRANT_INVALID_REF;
+ info->rx_ring_ref = GRANT_INVALID_REF;
+ info->rx.sring = NULL;
+ info->tx.sring = NULL;
+ info->irq = 0;
+
+ txs = (struct netif_tx_sring *)get_zeroed_page(GFP_KERNEL|__GFP_HIGH);
+ if (!txs) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(dev, err, "allocating tx ring page");
+ goto fail;
+ }
+ SHARED_RING_INIT(txs);
+ FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
+
+ err = xenbus_grant_ring(dev, virt_to_mfn(txs));
+ if (err < 0) {
+ free_page((unsigned long)txs);
+ goto fail;
+ }
+ info->tx_ring_ref = err;
+
+ rxs = (struct netif_rx_sring *)get_zeroed_page(GFP_KERNEL|__GFP_HIGH);
+ if (!rxs) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(dev, err, "allocating rx ring page");
+ goto fail;
+ }
+ SHARED_RING_INIT(rxs);
+ FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
+
+ err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
+ if (err < 0) {
+ free_page((unsigned long)rxs);
+ goto fail;
+ }
+ info->rx_ring_ref = err;
+
+ memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
+
+ err = bind_listening_port_to_irqhandler(
+ dev->otherend_id, netif_int, IRQF_SAMPLE_RANDOM, netdev->name,
+ netdev);
+ if (err < 0)
+ goto fail;
+ info->irq = err;
+
+ return 0;
+
+ fail:
+ return err;
+}
+
+/**
+ * Callback received when the backend's state changes.
+ */
+static void backend_changed(struct xenbus_device *dev,
+ enum xenbus_state backend_state)
+{
+ struct netfront_info *np = dev->dev.driver_data;
+ struct net_device *netdev = np->netdev;
+
+ DPRINTK("%s\n", xenbus_strstate(backend_state));
+
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitialised:
+ case XenbusStateConnected:
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ break;
+
+ case XenbusStateInitWait:
+ if (dev->state != XenbusStateInitialising)
+ break;
+ if (network_connect(netdev) != 0)
+ break;
+ xenbus_switch_state(dev, XenbusStateConnected);
+ send_fake_arp(netdev);
+ break;
+
+ case XenbusStateClosing:
+ xenbus_frontend_closed(dev);
+ break;
+ }
+}
+
+/** Send a packet on a net device to encourage switches to learn the
+ * MAC. We send a fake ARP request.
+ *
+ * @param dev device
+ * @return 0 on success, error code otherwise
+ */
+static void send_fake_arp(struct net_device *dev)
+{
+#ifdef CONFIG_INET
+ struct sk_buff *skb;
+ u32 src_ip, dst_ip;
+
+ dst_ip = INADDR_BROADCAST;
+ src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK);
+
+ /* No IP? Then nothing to do. */
+ if (src_ip == 0)
+ return;
+
+ skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
+ dst_ip, dev, src_ip,
+ /*dst_hw*/ NULL, /*src_hw*/ NULL,
+ /*target_hw*/ dev->dev_addr);
+ if (skb == NULL)
+ return;
+
+ dev_queue_xmit(skb);
+#endif
+}
+
+static inline int netfront_tx_slot_available(struct netfront_info *np)
+{
+ return ((np->tx.req_prod_pvt - np->tx.rsp_cons) <
+ (TX_MAX_TARGET - MAX_SKB_FRAGS - 2));
+}
+
+
+static inline void network_maybe_wake_tx(struct net_device *dev)
+{
+ struct netfront_info *np = netdev_priv(dev);
+
+ if (unlikely(netif_queue_stopped(dev)) &&
+ netfront_tx_slot_available(np) &&
+ likely(netif_running(dev)) &&
+ netfront_check_accelerator_queue_ready(dev, np))
+ netif_wake_queue(dev);
+}
+
+
+int netfront_check_queue_ready(struct net_device *dev)
+{
+ struct netfront_info *np = netdev_priv(dev);
+
+ return unlikely(netif_queue_stopped(dev)) &&
+ netfront_tx_slot_available(np) &&
+ likely(netif_running(dev));
+}
+EXPORT_SYMBOL(netfront_check_queue_ready);
+
+
+static int network_open(struct net_device *dev)
+{
+ struct netfront_info *np = netdev_priv(dev);
+
+ memset(&np->stats, 0, sizeof(np->stats));
+ napi_enable(&np->napi);
+
+ spin_lock_bh(&np->rx_lock);
+ if (netfront_carrier_ok(np)) {
+ network_alloc_rx_buffers(dev);
+ np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
+ if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)){
+ netfront_accelerator_call_stop_napi_irq(np, dev);
+
+ netif_rx_schedule(dev, &np->napi);
+ }
+ }
+ spin_unlock_bh(&np->rx_lock);
+
+ network_maybe_wake_tx(dev);
+
+ return 0;
+}
+
+static void network_tx_buf_gc(struct net_device *dev)
+{
+ RING_IDX cons, prod;
+ unsigned short id;
+ struct netfront_info *np = netdev_priv(dev);
+ struct sk_buff *skb;
+
+ BUG_ON(!netfront_carrier_ok(np));
+
+ do {
+ prod = np->tx.sring->rsp_prod;
+ rmb(); /* Ensure we see responses up to 'rp'. */
+
+ for (cons = np->tx.rsp_cons; cons != prod; cons++) {
+ struct netif_tx_response *txrsp;
+
+ txrsp = RING_GET_RESPONSE(&np->tx, cons);
+ if (txrsp->status == NETIF_RSP_NULL)
+ continue;
+
+ id = txrsp->id;
+ skb = np->tx_skbs[id];
+ if (unlikely(gnttab_query_foreign_access(
+ np->grant_tx_ref[id]) != 0)) {
+ printk(KERN_ALERT "network_tx_buf_gc: warning "
+ "-- grant still in use by backend "
+ "domain.\n");
+ BUG();
+ }
+ gnttab_end_foreign_access_ref(np->grant_tx_ref[id]);
+ gnttab_release_grant_reference(
+ &np->gref_tx_head, np->grant_tx_ref[id]);
+ np->grant_tx_ref[id] = GRANT_INVALID_REF;
+ add_id_to_freelist(np->tx_skbs, id);
+ dev_kfree_skb_irq(skb);
+ }
+
+ np->tx.rsp_cons = prod;
+
+ /*
+ * Set a new event, then check for race with update of tx_cons.
+ * Note that it is essential to schedule a callback, no matter
+ * how few buffers are pending. Even if there is space in the
+ * transmit ring, higher layers may be blocked because too much
+ * data is outstanding: in such cases notification from Xen is
+ * likely to be the only kick that we'll get.
+ */
+ np->tx.sring->rsp_event =
+ prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
+ mb();
+ } while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
+
+ network_maybe_wake_tx(dev);
+}
+
+static void rx_refill_timeout(unsigned long data)
+{
+ struct net_device *dev = (struct net_device *)data;
+ struct netfront_info *np = netdev_priv(dev);
+
+ netfront_accelerator_call_stop_napi_irq(np, dev);
+
+ netif_rx_schedule(dev, &np->napi);
+}
+
+static void network_alloc_rx_buffers(struct net_device *dev)
+{
+ unsigned short id;
+ struct netfront_info *np = netdev_priv(dev);
+ struct sk_buff *skb;
+ struct page *page;
+ int i, batch_target, notify;
+ RING_IDX req_prod = np->rx.req_prod_pvt;
+ struct xen_memory_reservation reservation;
+ grant_ref_t ref;
+ unsigned long pfn;
+ void *vaddr;
+ int nr_flips;
+ netif_rx_request_t *req;
+
+ if (unlikely(!netfront_carrier_ok(np)))
+ return;
+
+ /*
+ * Allocate skbuffs greedily, even though we batch updates to the
+ * receive ring. This creates a less bursty demand on the memory
+ * allocator, so should reduce the chance of failed allocation requests
+ * both for ourself and for other kernel subsystems.
+ */
+ batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
+ for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
+ /*
+ * Allocate an skb and a page. Do not use __dev_alloc_skb as
+ * that will allocate page-sized buffers which is not
+ * necessary here.
+ * 16 bytes added as necessary headroom for netif_receive_skb.
+ */
+ skb = alloc_skb(RX_COPY_THRESHOLD + 16 + NET_IP_ALIGN,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(!skb))
+ goto no_skb;
+
+ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+ if (!page) {
+ kfree_skb(skb);
+no_skb:
+ /* Any skbuffs queued for refill? Force them out. */
+ if (i != 0)
+ goto refill;
+ /* Could not allocate any skbuffs. Try again later. */
+ mod_timer(&np->rx_refill_timer,
+ jiffies + (HZ/10));
+ break;
+ }
+
+ skb_reserve(skb, 16 + NET_IP_ALIGN); /* mimic dev_alloc_skb() */
+ skb_shinfo(skb)->frags[0].page = page;
+ skb_shinfo(skb)->nr_frags = 1;
+ __skb_queue_tail(&np->rx_batch, skb);
+ }
+
+ /* Is the batch large enough to be worthwhile? */
+ if (i < (np->rx_target/2)) {
+ if (req_prod > np->rx.sring->req_prod)
+ goto push;
+ return;
+ }
+
+ /* Adjust our fill target if we risked running out of buffers. */
+ if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
+ ((np->rx_target *= 2) > np->rx_max_target))
+ np->rx_target = np->rx_max_target;
+
+ refill:
+ for (nr_flips = i = 0; ; i++) {
+ if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
+ break;
+
+ skb->dev = dev;
+
+ id = xennet_rxidx(req_prod + i);
+
+ BUG_ON(np->rx_skbs[id]);
+ np->rx_skbs[id] = skb;
+
+ ref = gnttab_claim_grant_reference(&np->gref_rx_head);
+ BUG_ON((signed short)ref < 0);
+ np->grant_rx_ref[id] = ref;
+
+ pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
+ vaddr = page_address(skb_shinfo(skb)->frags[0].page);
+
+ req = RING_GET_REQUEST(&np->rx, req_prod + i);
+ if (!np->copying_receiver) {
+ gnttab_grant_foreign_transfer_ref(ref,
+ np->xbdev->otherend_id,
+ pfn);
+ np->rx_pfn_array[nr_flips] = pfn_to_mfn(pfn);
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* Remove this page before passing
+ * back to Xen. */
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ MULTI_update_va_mapping(np->rx_mcl+i,
+ (unsigned long)vaddr,
+ __pte(0), 0);
+ }
+ nr_flips++;
+ } else {
+ gnttab_grant_foreign_access_ref(ref,
+ np->xbdev->otherend_id,
+ pfn_to_mfn(pfn),
+ 0);
+ }
+
+ req->id = id;
+ req->gref = ref;
+ }
+
+ if ( nr_flips != 0 ) {
+ /* Tell the ballon driver what is going on. */
+ balloon_update_driver_allowance(i);
+
+ set_xen_guest_handle(reservation.extent_start,
+ np->rx_pfn_array);
+ reservation.nr_extents = nr_flips;
+ reservation.extent_order = 0;
+ reservation.address_bits = 0;
+ reservation.domid = DOMID_SELF;
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* After all PTEs have been zapped, flush the TLB. */
+ np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
+ UVMF_TLB_FLUSH|UVMF_ALL;
+
+ /* Give away a batch of pages. */
+ np->rx_mcl[i].op = __HYPERVISOR_memory_op;
+ np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
+ np->rx_mcl[i].args[1] = (unsigned long)&reservation;
+
+ /* Zap PTEs and give away pages in one big
+ * multicall. */
+ if (unlikely(HYPERVISOR_multicall(np->rx_mcl, i+1)))
+ BUG();
+
+ /* Check return status of HYPERVISOR_memory_op(). */
+ if (unlikely(np->rx_mcl[i].result != i))
+ panic("Unable to reduce memory reservation\n");
+ while (nr_flips--)
+ BUG_ON(np->rx_mcl[nr_flips].result);
+ } else {
+ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation) != i)
+ panic("Unable to reduce memory reservation\n");
+ }
+ } else {
+ wmb();
+ }
+
+ /* Above is a suitable barrier to ensure backend will see requests. */
+ np->rx.req_prod_pvt = req_prod + i;
+ push:
+ RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
+ if (notify)
+ notify_remote_via_irq(np->irq);
+}
+
+static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
+ struct netif_tx_request *tx)
+{
+ struct netfront_info *np = netdev_priv(dev);
+ char *data = skb->data;
+ unsigned long mfn;
+ RING_IDX prod = np->tx.req_prod_pvt;
+ int frags = skb_shinfo(skb)->nr_frags;
+ unsigned int offset = offset_in_page(data);
+ unsigned int len = skb_headlen(skb);
+ unsigned int id;
+ grant_ref_t ref;
+ int i;
+
+ while (len > PAGE_SIZE - offset) {
+ tx->size = PAGE_SIZE - offset;
+ tx->flags |= NETTXF_more_data;
+ len -= tx->size;
+ data += tx->size;
+ offset = 0;
+
+ id = get_id_from_freelist(np->tx_skbs);
+ np->tx_skbs[id] = skb_get(skb);
+ tx = RING_GET_REQUEST(&np->tx, prod++);
+ tx->id = id;
+ ref = gnttab_claim_grant_reference(&np->gref_tx_head);
+ BUG_ON((signed short)ref < 0);
+
+ mfn = virt_to_mfn(data);
+ gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
+ mfn, GTF_readonly);
+
+ tx->gref = np->grant_tx_ref[id] = ref;
+ tx->offset = offset;
+ tx->size = len;
+ tx->flags = 0;
+ }
+
+ for (i = 0; i < frags; i++) {
+ skb_frag_t *frag = skb_shinfo(skb)->frags + i;
+
+ tx->flags |= NETTXF_more_data;
+
+ id = get_id_from_freelist(np->tx_skbs);
+ np->tx_skbs[id] = skb_get(skb);
+ tx = RING_GET_REQUEST(&np->tx, prod++);
+ tx->id = id;
+ ref = gnttab_claim_grant_reference(&np->gref_tx_head);
+ BUG_ON((signed short)ref < 0);
+
+ mfn = pfn_to_mfn(page_to_pfn(frag->page));
+ gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
+ mfn, GTF_readonly);
+
+ tx->gref = np->grant_tx_ref[id] = ref;
+ tx->offset = frag->page_offset;
+ tx->size = frag->size;
+ tx->flags = 0;
+ }
+
+ np->tx.req_prod_pvt = prod;
+}
+
+static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ unsigned short id;
+ struct netfront_info *np = netdev_priv(dev);
+ struct netif_tx_request *tx;
+ struct netif_extra_info *extra;
+ char *data = skb->data;
+ RING_IDX i;
+ grant_ref_t ref;
+ unsigned long mfn;
+ int notify;
+ int frags = skb_shinfo(skb)->nr_frags;
+ unsigned int offset = offset_in_page(data);
+ unsigned int len = skb_headlen(skb);
+
+ /* Check the fast path, if hooks are available */
+ if (np->accel_vif_state.hooks &&
+ np->accel_vif_state.hooks->start_xmit(skb, dev)) {
+ /* Fast path has sent this packet */
+ return 0;
+ }
+
+ frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
+ if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
+ printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
+ frags);
+ dump_stack();
+ goto drop;
+ }
+
+ spin_lock_irq(&np->tx_lock);
+
+ if (unlikely(!netfront_carrier_ok(np) ||
+ (frags > 1 && !xennet_can_sg(dev)) ||
+ netif_needs_gso(dev, skb))) {
+ spin_unlock_irq(&np->tx_lock);
+ goto drop;
+ }
+
+ i = np->tx.req_prod_pvt;
+
+ id = get_id_from_freelist(np->tx_skbs);
+ np->tx_skbs[id] = skb;
+
+ tx = RING_GET_REQUEST(&np->tx, i);
+
+ tx->id = id;
+ ref = gnttab_claim_grant_reference(&np->gref_tx_head);
+ BUG_ON((signed short)ref < 0);
+ mfn = virt_to_mfn(data);
+ gnttab_grant_foreign_access_ref(
+ ref, np->xbdev->otherend_id, mfn, GTF_readonly);
+ tx->gref = np->grant_tx_ref[id] = ref;
+ tx->offset = offset;
+ tx->size = len;
+
+ tx->flags = 0;
+ extra = NULL;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
+ tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
+#ifdef CONFIG_XEN
+ if (skb->proto_data_valid) /* remote but checksummed? */
+ tx->flags |= NETTXF_data_validated;
+#endif
+
+#if HAVE_TSO
+ if (skb_shinfo(skb)->gso_size) {
+ struct netif_extra_info *gso = (struct netif_extra_info *)
+ RING_GET_REQUEST(&np->tx, ++i);
+
+ if (extra)
+ extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
+ else
+ tx->flags |= NETTXF_extra_info;
+
+ gso->u.gso.size = skb_shinfo(skb)->gso_size;
+ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
+ gso->u.gso.pad = 0;
+ gso->u.gso.features = 0;
+
+ gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
+ gso->flags = 0;
+ extra = gso;
+ }
+#endif
+
+ np->tx.req_prod_pvt = i + 1;
+
+ xennet_make_frags(skb, dev, tx);
+ tx->size = skb->len;
+
+ RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
+ if (notify)
+ notify_remote_via_irq(np->irq);
+
+ np->stats.tx_bytes += skb->len;
+ np->stats.tx_packets++;
+ dev->trans_start = jiffies;
+
+ /* Note: It is not safe to access skb after network_tx_buf_gc()! */
+ network_tx_buf_gc(dev);
+
+ if (!netfront_tx_slot_available(np))
+ netif_stop_queue(dev);
+
+ spin_unlock_irq(&np->tx_lock);
+
+ return 0;
+
+ drop:
+ np->stats.tx_dropped++;
+ dev_kfree_skb(skb);
+ return 0;
+}
+
+static irqreturn_t netif_int(int irq, void *dev_id)
+{
+ struct net_device *dev = dev_id;
+ struct netfront_info *np = netdev_priv(dev);
+ unsigned long flags;
+
+ spin_lock_irqsave(&np->tx_lock, flags);
+
+ if (likely(netfront_carrier_ok(np))) {
+ network_tx_buf_gc(dev);
+ /* Under tx_lock: protects access to rx shared-ring indexes. */
+ if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) {
+ netfront_accelerator_call_stop_napi_irq(np, dev);
+
+ netif_rx_schedule(dev, &np->napi);
+ dev->last_rx = jiffies;
+ }
+ }
+
+ spin_unlock_irqrestore(&np->tx_lock, flags);
+
+ return IRQ_HANDLED;
+}
+
+static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
+ grant_ref_t ref)
+{
+ int new = xennet_rxidx(np->rx.req_prod_pvt);
+
+ BUG_ON(np->rx_skbs[new]);
+ np->rx_skbs[new] = skb;
+ np->grant_rx_ref[new] = ref;
+ RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new;
+ RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref;
+ np->rx.req_prod_pvt++;
+}
+
+int xennet_get_extras(struct netfront_info *np,
+ struct netif_extra_info *extras, RING_IDX rp)
+
+{
+ struct netif_extra_info *extra;
+ RING_IDX cons = np->rx.rsp_cons;
+ int err = 0;
+
+ do {
+ struct sk_buff *skb;
+ grant_ref_t ref;
+
+ if (unlikely(cons + 1 == rp)) {
+ if (net_ratelimit())
+ WPRINTK("Missing extra info\n");
+ err = -EBADR;
+ break;
+ }
+
+ extra = (struct netif_extra_info *)
+ RING_GET_RESPONSE(&np->rx, ++cons);
+
+ if (unlikely(!extra->type ||
+ extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
+ if (net_ratelimit())
+ WPRINTK("Invalid extra type: %d\n",
+ extra->type);
+ err = -EINVAL;
+ } else {
+ memcpy(&extras[extra->type - 1], extra,
+ sizeof(*extra));
+ }
+
+ skb = xennet_get_rx_skb(np, cons);
+ ref = xennet_get_rx_ref(np, cons);
+ xennet_move_rx_slot(np, skb, ref);
+ } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
+
+ np->rx.rsp_cons = cons;
+ return err;
+}
+
+static int xennet_get_responses(struct netfront_info *np,
+ struct netfront_rx_info *rinfo, RING_IDX rp,
+ struct sk_buff_head *list,
+ int *pages_flipped_p)
+{
+ int pages_flipped = *pages_flipped_p;
+ struct mmu_update *mmu;
+ struct multicall_entry *mcl;
+ struct netif_rx_response *rx = &rinfo->rx;
+ struct netif_extra_info *extras = rinfo->extras;
+ RING_IDX cons = np->rx.rsp_cons;
+ struct sk_buff *skb = xennet_get_rx_skb(np, cons);
+ grant_ref_t ref = xennet_get_rx_ref(np, cons);
+ int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
+ int frags = 1;
+ int err = 0;
+ unsigned long ret;
+
+ if (rx->flags & NETRXF_extra_info) {
+ err = xennet_get_extras(np, extras, rp);
+ cons = np->rx.rsp_cons;
+ }
+
+ for (;;) {
+ unsigned long mfn;
+
+ if (unlikely(rx->status < 0 ||
+ rx->offset + rx->status > PAGE_SIZE)) {
+ if (net_ratelimit())
+ WPRINTK("rx->offset: %x, size: %u\n",
+ rx->offset, rx->status);
+ xennet_move_rx_slot(np, skb, ref);
+ err = -EINVAL;
+ goto next;
+ }
+
+ /*
+ * This definitely indicates a bug, either in this driver or in
+ * the backend driver. In future this should flag the bad
+ * situation to the system controller to reboot the backed.
+ */
+ if (ref == GRANT_INVALID_REF) {
+ if (net_ratelimit())
+ WPRINTK("Bad rx response id %d.\n", rx->id);
+ err = -EINVAL;
+ goto next;
+ }
+
+ if (!np->copying_receiver) {
+ /* Memory pressure, insufficient buffer
+ * headroom, ... */
+ if (!(mfn = gnttab_end_foreign_transfer_ref(ref))) {
+ if (net_ratelimit())
+ WPRINTK("Unfulfilled rx req "
+ "(id=%d, st=%d).\n",
+ rx->id, rx->status);
+ xennet_move_rx_slot(np, skb, ref);
+ err = -ENOMEM;
+ goto next;
+ }
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* Remap the page. */
+ struct page *page =
+ skb_shinfo(skb)->frags[0].page;
+ unsigned long pfn = page_to_pfn(page);
+ void *vaddr = page_address(page);
+
+ mcl = np->rx_mcl + pages_flipped;
+ mmu = np->rx_mmu + pages_flipped;
+
+ MULTI_update_va_mapping(mcl,
+ (unsigned long)vaddr,
+ pfn_pte_ma(mfn,
+ PAGE_KERNEL),
+ 0);
+ mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
+ | MMU_MACHPHYS_UPDATE;
+ mmu->val = pfn;
+
+ set_phys_to_machine(pfn, mfn);
+ }
+ pages_flipped++;
+ } else {
+ ret = gnttab_end_foreign_access_ref(ref);
+ BUG_ON(!ret);
+ }
+
+ gnttab_release_grant_reference(&np->gref_rx_head, ref);
+
+ __skb_queue_tail(list, skb);
+
+next:
+ if (!(rx->flags & NETRXF_more_data))
+ break;
+
+ if (cons + frags == rp) {
+ if (net_ratelimit())
+ WPRINTK("Need more frags\n");
+ err = -ENOENT;
+ break;
+ }
+
+ rx = RING_GET_RESPONSE(&np->rx, cons + frags);
+ skb = xennet_get_rx_skb(np, cons + frags);
+ ref = xennet_get_rx_ref(np, cons + frags);
+ frags++;
+ }
+
+ if (unlikely(frags > max)) {
+ if (net_ratelimit())
+ WPRINTK("Too many frags\n");
+ err = -E2BIG;
+ }
+
+ if (unlikely(err))
+ np->rx.rsp_cons = cons + frags;
+
+ *pages_flipped_p = pages_flipped;
+
+ return err;
+}
+
+static RING_IDX xennet_fill_frags(struct netfront_info *np,
+ struct sk_buff *skb,
+ struct sk_buff_head *list)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int nr_frags = shinfo->nr_frags;
+ RING_IDX cons = np->rx.rsp_cons;
+ skb_frag_t *frag = shinfo->frags + nr_frags;
+ struct sk_buff *nskb;
+
+ while ((nskb = __skb_dequeue(list))) {
+ struct netif_rx_response *rx =
+ RING_GET_RESPONSE(&np->rx, ++cons);
+
+ frag->page = skb_shinfo(nskb)->frags[0].page;
+ frag->page_offset = rx->offset;
+ frag->size = rx->status;
+
+ skb->data_len += rx->status;
+
+ skb_shinfo(nskb)->nr_frags = 0;
+ kfree_skb(nskb);
+
+ frag++;
+ nr_frags++;
+ }
+
+ shinfo->nr_frags = nr_frags;
+ return cons;
+}
+
+static int xennet_set_skb_gso(struct sk_buff *skb,
+ struct netif_extra_info *gso)
+{
+ if (!gso->u.gso.size) {
+ if (net_ratelimit())
+ WPRINTK("GSO size must not be zero.\n");
+ return -EINVAL;
+ }
+
+ /* Currently only TCPv4 S.O. is supported. */
+ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
+ if (net_ratelimit())
+ WPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
+ return -EINVAL;
+ }
+
+#if HAVE_TSO
+ skb_shinfo(skb)->gso_size = gso->u.gso.size;
+#if HAVE_GSO
+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
+ /* Header must be checked, and gso_segs computed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+#endif
+ skb_shinfo(skb)->gso_segs = 0;
+
+ return 0;
+#else
+ if (net_ratelimit())
+ WPRINTK("GSO unsupported by this kernel.\n");
+ return -EINVAL;
+#endif
+}
+
+static int netif_poll(struct napi_struct *napi, int budget)
+{
+ struct netfront_info *np = container_of(napi, struct netfront_info, napi);
+ struct net_device *dev = np->netdev;
+ struct sk_buff *skb;
+ struct netfront_rx_info rinfo;
+ struct netif_rx_response *rx = &rinfo.rx;
+ struct netif_extra_info *extras = rinfo.extras;
+ RING_IDX i, rp;
+ struct multicall_entry *mcl;
+ int work_done, more_to_do = 1, accel_more_to_do = 1;
+ struct sk_buff_head rxq;
+ struct sk_buff_head errq;
+ struct sk_buff_head tmpq;
+ unsigned long flags;
+ unsigned int len;
+ int pages_flipped = 0;
+ int err;
+
+ spin_lock(&np->rx_lock); /* no need for spin_lock_bh() in ->poll() */
+
+ if (unlikely(!netfront_carrier_ok(np))) {
+ spin_unlock(&np->rx_lock);
+ return 0;
+ }
+
+ skb_queue_head_init(&rxq);
+ skb_queue_head_init(&errq);
+ skb_queue_head_init(&tmpq);
+
+ rp = np->rx.sring->rsp_prod;
+ rmb(); /* Ensure we see queued responses up to 'rp'. */
+
+ i = np->rx.rsp_cons;
+ work_done = 0;
+ while ((i != rp) && (work_done < budget)) {
+ memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
+ memset(extras, 0, sizeof(rinfo.extras));
+
+ err = xennet_get_responses(np, &rinfo, rp, &tmpq,
+ &pages_flipped);
+
+ if (unlikely(err)) {
+err:
+ while ((skb = __skb_dequeue(&tmpq)))
+ __skb_queue_tail(&errq, skb);
+ np->stats.rx_errors++;
+ i = np->rx.rsp_cons;
+ continue;
+ }
+
+ skb = __skb_dequeue(&tmpq);
+
+ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
+ struct netif_extra_info *gso;
+ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
+
+ if (unlikely(xennet_set_skb_gso(skb, gso))) {
+ __skb_queue_head(&tmpq, skb);
+ np->rx.rsp_cons += skb_queue_len(&tmpq);
+ goto err;
+ }
+ }
+
+ NETFRONT_SKB_CB(skb)->page = skb_shinfo(skb)->frags[0].page;
+ NETFRONT_SKB_CB(skb)->offset = rx->offset;
+
+ len = rx->status;
+ if (len > RX_COPY_THRESHOLD)
+ len = RX_COPY_THRESHOLD;
+ skb_put(skb, len);
+
+ if (rx->status > len) {
+ skb_shinfo(skb)->frags[0].page_offset =
+ rx->offset + len;
+ skb_shinfo(skb)->frags[0].size = rx->status - len;
+ skb->data_len = rx->status - len;
+ } else {
+ skb_shinfo(skb)->frags[0].page = NULL;
+ skb_shinfo(skb)->nr_frags = 0;
+ }
+
+ i = xennet_fill_frags(np, skb, &tmpq);
+
+ /*
+ * Truesize must approximates the size of true data plus
+ * any supervisor overheads. Adding hypervisor overheads
+ * has been shown to significantly reduce achievable
+ * bandwidth with the default receive buffer size. It is
+ * therefore not wise to account for it here.
+ *
+ * After alloc_skb(RX_COPY_THRESHOLD), truesize is set to
+ * RX_COPY_THRESHOLD + the supervisor overheads. Here, we
+ * add the size of the data pulled in xennet_fill_frags().
+ *
+ * We also adjust for any unused space in the main data
+ * area by subtracting (RX_COPY_THRESHOLD - len). This is
+ * especially important with drivers which split incoming
+ * packets into header and data, using only 66 bytes of
+ * the main data area (see the e1000 driver for example.)
+ * On such systems, without this last adjustement, our
+ * achievable receive throughout using the standard receive
+ * buffer size was cut by 25%(!!!).
+ */
+ skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
+ skb->len += skb->data_len;
+
+ /*
+ * Old backends do not assert data_validated but we
+ * can infer it from csum_blank so test both flags.
+ */
+ if (rx->flags & (NETRXF_data_validated|NETRXF_csum_blank))
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ else
+ skb->ip_summed = CHECKSUM_NONE;
+#ifdef CONFIG_XEN
+ skb->proto_data_valid = (skb->ip_summed != CHECKSUM_NONE);
+ skb->proto_csum_blank = !!(rx->flags & NETRXF_csum_blank);
+#endif
+ np->stats.rx_packets++;
+ np->stats.rx_bytes += skb->len;
+
+ __skb_queue_tail(&rxq, skb);
+
+ np->rx.rsp_cons = ++i;
+ work_done++;
+ }
+
+ if (pages_flipped) {
+ /* Some pages are no longer absent... */
+ balloon_update_driver_allowance(-pages_flipped);
+
+ /* Do all the remapping work and M2P updates. */
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ mcl = np->rx_mcl + pages_flipped;
+ mcl->op = __HYPERVISOR_mmu_update;
+ mcl->args[0] = (unsigned long)np->rx_mmu;
+ mcl->args[1] = pages_flipped;
+ mcl->args[2] = 0;
+ mcl->args[3] = DOMID_SELF;
+ err = HYPERVISOR_multicall_check(np->rx_mcl,
+ pages_flipped + 1,
+ NULL);
+ BUG_ON(err);
+ }
+ }
+
+ __skb_queue_purge(&errq);
+
+ while ((skb = __skb_dequeue(&rxq)) != NULL) {
+ struct page *page = NETFRONT_SKB_CB(skb)->page;
+ void *vaddr = page_address(page);
+ unsigned offset = NETFRONT_SKB_CB(skb)->offset;
+
+ memcpy(skb->data, vaddr + offset, skb_headlen(skb));
+
+ if (page != skb_shinfo(skb)->frags[0].page)
+ __free_page(page);
+
+ /* Ethernet work: Delayed to here as it peeks the header. */
+ skb->protocol = eth_type_trans(skb, dev);
+
+ /* Pass it up. */
+ netif_receive_skb(skb);
+ dev->last_rx = jiffies;
+ }
+
+ /* If we get a callback with very few responses, reduce fill target. */
+ /* NB. Note exponential increase, linear decrease. */
+ if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
+ ((3*np->rx_target) / 4)) &&
+ (--np->rx_target < np->rx_min_target))
+ np->rx_target = np->rx_min_target;
+
+ network_alloc_rx_buffers(dev);
+
+ if (work_done < budget) {
+ /* there's some spare capacity, try the accelerated path */
+ int accel_budget = budget - work_done;
+ int accel_budget_start = accel_budget;
+
+ if (np->accel_vif_state.hooks) {
+ accel_more_to_do =
+ np->accel_vif_state.hooks->netdev_poll
+ (dev, &accel_budget);
+ work_done += (accel_budget_start - accel_budget);
+ } else
+ accel_more_to_do = 0;
+ }
+
+ if (work_done < budget) {
+ local_irq_save(flags);
+
+ RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
+
+ if (!more_to_do && !accel_more_to_do &&
+ np->accel_vif_state.hooks) {
+ /*
+ * Slow path has nothing more to do, see if
+ * fast path is likewise
+ */
+ accel_more_to_do =
+ np->accel_vif_state.hooks->start_napi_irq(dev);
+ }
+
+ if (!more_to_do && !accel_more_to_do)
+ __netif_rx_complete(dev, napi);
+
+ local_irq_restore(flags);
+ }
+
+ spin_unlock(&np->rx_lock);
+
+ return work_done;
+}
+
+static void netif_release_tx_bufs(struct netfront_info *np)
+{
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 1; i <= NET_TX_RING_SIZE; i++) {
+ if ((unsigned long)np->tx_skbs[i] < PAGE_OFFSET)
+ continue;
+
+ skb = np->tx_skbs[i];
+ gnttab_end_foreign_access_ref(np->grant_tx_ref[i]);
+ gnttab_release_grant_reference(
+ &np->gref_tx_head, np->grant_tx_ref[i]);
+ np->grant_tx_ref[i] = GRANT_INVALID_REF;
+ add_id_to_freelist(np->tx_skbs, i);
+ dev_kfree_skb_irq(skb);
+ }
+}
+
+static void netif_release_rx_bufs_flip(struct netfront_info *np)
+{
+ struct mmu_update *mmu = np->rx_mmu;
+ struct multicall_entry *mcl = np->rx_mcl;
+ struct sk_buff_head free_list;
+ struct sk_buff *skb;
+ unsigned long mfn;
+ int xfer = 0, noxfer = 0, unused = 0;
+ int id, ref, rc;
+
+ skb_queue_head_init(&free_list);
+
+ spin_lock_bh(&np->rx_lock);
+
+ for (id = 0; id < NET_RX_RING_SIZE; id++) {
+ if ((ref = np->grant_rx_ref[id]) == GRANT_INVALID_REF) {
+ unused++;
+ continue;
+ }
+
+ skb = np->rx_skbs[id];
+ mfn = gnttab_end_foreign_transfer_ref(ref);
+ gnttab_release_grant_reference(&np->gref_rx_head, ref);
+ np->grant_rx_ref[id] = GRANT_INVALID_REF;
+ add_id_to_freelist(np->rx_skbs, id);
+
+ if (0 == mfn) {
+ struct page *page = skb_shinfo(skb)->frags[0].page;
+ balloon_release_driver_page(page);
+ skb_shinfo(skb)->nr_frags = 0;
+ dev_kfree_skb(skb);
+ noxfer++;
+ continue;
+ }
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* Remap the page. */
+ struct page *page = skb_shinfo(skb)->frags[0].page;
+ unsigned long pfn = page_to_pfn(page);
+ void *vaddr = page_address(page);
+
+ MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
+ pfn_pte_ma(mfn, PAGE_KERNEL),
+ 0);
+ mcl++;
+ mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
+ | MMU_MACHPHYS_UPDATE;
+ mmu->val = pfn;
+ mmu++;
+
+ set_phys_to_machine(pfn, mfn);
+ }
+ __skb_queue_tail(&free_list, skb);
+ xfer++;
+ }
+
+ DPRINTK("%s: %d xfer, %d noxfer, %d unused\n",
+ __FUNCTION__, xfer, noxfer, unused);
+
+ if (xfer) {
+ /* Some pages are no longer absent... */
+ balloon_update_driver_allowance(-xfer);
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* Do all the remapping work and M2P updates. */
+ mcl->op = __HYPERVISOR_mmu_update;
+ mcl->args[0] = (unsigned long)np->rx_mmu;
+ mcl->args[1] = mmu - np->rx_mmu;
+ mcl->args[2] = 0;
+ mcl->args[3] = DOMID_SELF;
+ mcl++;
+ rc = HYPERVISOR_multicall_check(
+ np->rx_mcl, mcl - np->rx_mcl, NULL);
+ BUG_ON(rc);
+ }
+ }
+
+ __skb_queue_purge(&free_list);
+
+ spin_unlock_bh(&np->rx_lock);
+}
+
+static void netif_release_rx_bufs_copy(struct netfront_info *np)
+{
+ struct sk_buff *skb;
+ int i, ref;
+ int busy = 0, inuse = 0;
+
+ spin_lock_bh(&np->rx_lock);
+
+ for (i = 0; i < NET_RX_RING_SIZE; i++) {
+ ref = np->grant_rx_ref[i];
+
+ if (ref == GRANT_INVALID_REF)
+ continue;
+
+ inuse++;
+
+ skb = np->rx_skbs[i];
+
+ if (!gnttab_end_foreign_access_ref(ref))
+ {
+ busy++;
+ continue;
+ }
+
+ gnttab_release_grant_reference(&np->gref_rx_head, ref);
+ np->grant_rx_ref[i] = GRANT_INVALID_REF;
+ add_id_to_freelist(np->rx_skbs, i);
+
+ skb_shinfo(skb)->nr_frags = 0;
+ dev_kfree_skb(skb);
+ }
+
+ if (busy)
+ DPRINTK("%s: Unable to release %d of %d inuse grant references out of %ld total.\n",
+ __FUNCTION__, busy, inuse, NET_RX_RING_SIZE);
+
+ spin_unlock_bh(&np->rx_lock);
+}
+
+static int network_close(struct net_device *dev)
+{
+ struct netfront_info *np = netdev_priv(dev);
+ netif_stop_queue(np->netdev);
+ napi_disable(&np->napi);
+ return 0;
+}
+
+
+static struct net_device_stats *network_get_stats(struct net_device *dev)
+{
+ struct netfront_info *np = netdev_priv(dev);
+
+ netfront_accelerator_call_get_stats(np, dev);
+ return &np->stats;
+}
+
+static int xennet_set_mac_address(struct net_device *dev, void *p)
+{
+ struct netfront_info *np = netdev_priv(dev);
+ struct sockaddr *addr = p;
+
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+
+ memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+ memcpy(np->mac, addr->sa_data, ETH_ALEN);
+
+ return 0;
+}
+
+static int xennet_change_mtu(struct net_device *dev, int mtu)
+{
+ int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
+
+ if (mtu > max)
+ return -EINVAL;
+ dev->mtu = mtu;
+ return 0;
+}
+
+static int xennet_set_sg(struct net_device *dev, u32 data)
+{
+ if (data) {
+ struct netfront_info *np = netdev_priv(dev);
+ int val;
+
+ if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
+ "%d", &val) < 0)
+ val = 0;
+ if (!val)
+ return -ENOSYS;
+ } else if (dev->mtu > ETH_DATA_LEN)
+ dev->mtu = ETH_DATA_LEN;
+
+ return ethtool_op_set_sg(dev, data);
+}
+
+static int xennet_set_tso(struct net_device *dev, u32 data)
+{
+ if (data) {
+ struct netfront_info *np = netdev_priv(dev);
+ int val;
+
+ if (xenbus_scanf(XBT_NIL, np->xbdev->otherend,
+ "feature-gso-tcpv4", "%d", &val) < 0)
+ val = 0;
+ if (!val)
+ return -ENOSYS;
+ }
+
+ return ethtool_op_set_tso(dev, data);
+}
+
+static void xennet_set_features(struct net_device *dev)
+{
+ dev_disable_gso_features(dev);
+ xennet_set_sg(dev, 0);
+
+ /* We need checksum offload to enable scatter/gather and TSO. */
+ if (!(dev->features & NETIF_F_IP_CSUM))
+ return;
+
+ if (xennet_set_sg(dev, 1))
+ return;
+
+ /* Before 2.6.9 TSO seems to be unreliable so do not enable it
+ * on older kernels.
+ */
+ if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9))
+ xennet_set_tso(dev, 1);
+}
+
+static int network_connect(struct net_device *dev)
+{
+ struct netfront_info *np = netdev_priv(dev);
+ int i, requeue_idx, err;
+ struct sk_buff *skb;
+ grant_ref_t ref;
+ netif_rx_request_t *req;
+ unsigned int feature_rx_copy, feature_rx_flip;
+
+ err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
+ "feature-rx-copy", "%u", &feature_rx_copy);
+ if (err != 1)
+ feature_rx_copy = 0;
+ err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
+ "feature-rx-flip", "%u", &feature_rx_flip);
+ if (err != 1)
+ feature_rx_flip = 1;
+
+ /*
+ * Copy packets on receive path if:
+ * (a) This was requested by user, and the backend supports it; or
+ * (b) Flipping was requested, but this is unsupported by the backend.
+ */
+ np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) ||
+ (MODPARM_rx_flip && !feature_rx_flip));
+
+ err = talk_to_backend(np->xbdev, np);
+ if (err)
+ return err;
+
+ xennet_set_features(dev);
+
+ DPRINTK("device %s has %sing receive path.\n",
+ dev->name, np->copying_receiver ? "copy" : "flipp");
+
+ spin_lock_bh(&np->rx_lock);
+ spin_lock_irq(&np->tx_lock);
+
+ /*
+ * Recovery procedure:
+ * NB. Freelist index entries are always going to be less than
+ * PAGE_OFFSET, whereas pointers to skbs will always be equal or
+ * greater than PAGE_OFFSET: we use this property to distinguish
+ * them.
+ */
+
+ /* Step 1: Discard all pending TX packet fragments. */
+ netif_release_tx_bufs(np);
+
+ /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
+ for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
+ if (!np->rx_skbs[i])
+ continue;
+
+ skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
+ ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
+ req = RING_GET_REQUEST(&np->rx, requeue_idx);
+
+ if (!np->copying_receiver) {
+ gnttab_grant_foreign_transfer_ref(
+ ref, np->xbdev->otherend_id,
+ page_to_pfn(skb_shinfo(skb)->frags->page));
+ } else {
+ gnttab_grant_foreign_access_ref(
+ ref, np->xbdev->otherend_id,
+ pfn_to_mfn(page_to_pfn(skb_shinfo(skb)->
+ frags->page)),
+ 0);
+ }
+ req->gref = ref;
+ req->id = requeue_idx;
+
+ requeue_idx++;
+ }
+
+ np->rx.req_prod_pvt = requeue_idx;
+
+ /*
+ * Step 3: All public and private state should now be sane. Get
+ * ready to start sending and receiving packets and give the driver
+ * domain a kick because we've probably just requeued some
+ * packets.
+ */
+ netfront_carrier_on(np);
+ notify_remote_via_irq(np->irq);
+ network_tx_buf_gc(dev);
+ network_alloc_rx_buffers(dev);
+
+ spin_unlock_irq(&np->tx_lock);
+ spin_unlock_bh(&np->rx_lock);
+
+ return 0;
+}
+
+static void netif_uninit(struct net_device *dev)
+{
+ struct netfront_info *np = netdev_priv(dev);
+ netif_release_tx_bufs(np);
+ if (np->copying_receiver)
+ netif_release_rx_bufs_copy(np);
+ else
+ netif_release_rx_bufs_flip(np);
+ gnttab_free_grant_references(np->gref_tx_head);
+ gnttab_free_grant_references(np->gref_rx_head);
+}
+
+static struct ethtool_ops network_ethtool_ops =
+{
+ .get_tx_csum = ethtool_op_get_tx_csum,
+ .set_tx_csum = ethtool_op_set_tx_csum,
+ .get_sg = ethtool_op_get_sg,
+ .set_sg = xennet_set_sg,
+#if HAVE_TSO
+ .get_tso = ethtool_op_get_tso,
+ .set_tso = xennet_set_tso,
+#endif
+ .get_link = ethtool_op_get_link,
+};
+
+#ifdef CONFIG_SYSFS
+static ssize_t show_rxbuf_min(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct netfront_info *info = netdev_priv(to_net_dev(dev));
+
+ return sprintf(buf, "%u\n", info->rx_min_target);
+}
+
+static ssize_t store_rxbuf_min(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct netfront_info *np = netdev_priv(netdev);
+ char *endp;
+ unsigned long target;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ target = simple_strtoul(buf, &endp, 0);
+ if (endp == buf)
+ return -EBADMSG;
+
+ if (target < RX_MIN_TARGET)
+ target = RX_MIN_TARGET;
+ if (target > RX_MAX_TARGET)
+ target = RX_MAX_TARGET;
+
+ spin_lock_bh(&np->rx_lock);
+ if (target > np->rx_max_target)
+ np->rx_max_target = target;
+ np->rx_min_target = target;
+ if (target > np->rx_target)
+ np->rx_target = target;
+
+ network_alloc_rx_buffers(netdev);
+
+ spin_unlock_bh(&np->rx_lock);
+ return len;
+}
+
+static ssize_t show_rxbuf_max(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct netfront_info *info = netdev_priv(to_net_dev(dev));
+
+ return sprintf(buf, "%u\n", info->rx_max_target);
+}
+
+static ssize_t store_rxbuf_max(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct netfront_info *np = netdev_priv(netdev);
+ char *endp;
+ unsigned long target;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ target = simple_strtoul(buf, &endp, 0);
+ if (endp == buf)
+ return -EBADMSG;
+
+ if (target < RX_MIN_TARGET)
+ target = RX_MIN_TARGET;
+ if (target > RX_MAX_TARGET)
+ target = RX_MAX_TARGET;
+
+ spin_lock_bh(&np->rx_lock);
+ if (target < np->rx_min_target)
+ np->rx_min_target = target;
+ np->rx_max_target = target;
+ if (target < np->rx_target)
+ np->rx_target = target;
+
+ network_alloc_rx_buffers(netdev);
+
+ spin_unlock_bh(&np->rx_lock);
+ return len;
+}
+
+static ssize_t show_rxbuf_cur(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct netfront_info *info = netdev_priv(to_net_dev(dev));
+
+ return sprintf(buf, "%u\n", info->rx_target);
+}
+
+static struct device_attribute xennet_attrs[] = {
+ __ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
+ __ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
+ __ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
+};
+
+static int xennet_sysfs_addif(struct net_device *netdev)
+{
+ int i;
+ int error = 0;
+
+ for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
+ error = device_create_file(&netdev->dev,
+ &xennet_attrs[i]);
+ if (error)
+ goto fail;
+ }
+ return 0;
+
+ fail:
+ while (--i >= 0)
+ device_remove_file(&netdev->dev, &xennet_attrs[i]);
+ return error;
+}
+
+static void xennet_sysfs_delif(struct net_device *netdev)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++)
+ device_remove_file(&netdev->dev, &xennet_attrs[i]);
+}
+
+#endif /* CONFIG_SYSFS */
+
+
+/*
+ * Nothing to do here. Virtual interface is point-to-point and the
+ * physical interface is probably promiscuous anyway.
+ */
+static void network_set_multicast_list(struct net_device *dev)
+{
+}
+
+static struct net_device * __devinit create_netdev(struct xenbus_device *dev)
+{
+ int i, err = 0;
+ struct net_device *netdev = NULL;
+ struct netfront_info *np = NULL;
+
+ netdev = alloc_etherdev(sizeof(struct netfront_info));
+ if (!netdev) {
+ printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
+ __FUNCTION__);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ np = netdev_priv(netdev);
+ np->xbdev = dev;
+
+ spin_lock_init(&np->tx_lock);
+ spin_lock_init(&np->rx_lock);
+
+ init_accelerator_vif(np, dev);
+
+ skb_queue_head_init(&np->rx_batch);
+ np->rx_target = RX_DFL_MIN_TARGET;
+ np->rx_min_target = RX_DFL_MIN_TARGET;
+ np->rx_max_target = RX_MAX_TARGET;
+
+ init_timer(&np->rx_refill_timer);
+ np->rx_refill_timer.data = (unsigned long)netdev;
+ np->rx_refill_timer.function = rx_refill_timeout;
+
+ /* Initialise {tx,rx}_skbs as a free chain containing every entry. */
+ for (i = 0; i <= NET_TX_RING_SIZE; i++) {
+ np->tx_skbs[i] = (void *)((unsigned long) i+1);
+ np->grant_tx_ref[i] = GRANT_INVALID_REF;
+ }
+
+ for (i = 0; i < NET_RX_RING_SIZE; i++) {
+ np->rx_skbs[i] = NULL;
+ np->grant_rx_ref[i] = GRANT_INVALID_REF;
+ }
+
+ /* A grant for every tx ring slot */
+ if (gnttab_alloc_grant_references(TX_MAX_TARGET,
+ &np->gref_tx_head) < 0) {
+ printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
+ err = -ENOMEM;
+ goto exit;
+ }
+ /* A grant for every rx ring slot */
+ if (gnttab_alloc_grant_references(RX_MAX_TARGET,
+ &np->gref_rx_head) < 0) {
+ printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
+ err = -ENOMEM;
+ goto exit_free_tx;
+ }
+
+ netdev->open = network_open;
+ netdev->hard_start_xmit = network_start_xmit;
+ netdev->stop = network_close;
+ netdev->get_stats = network_get_stats;
+ netif_napi_add(netdev, &np->napi, netif_poll, 64);
+ netdev->set_multicast_list = network_set_multicast_list;
+ netdev->uninit = netif_uninit;
+ netdev->set_mac_address = xennet_set_mac_address;
+ netdev->change_mtu = xennet_change_mtu;
+ netdev->features = NETIF_F_IP_CSUM;
+
+ SET_ETHTOOL_OPS(netdev, &network_ethtool_ops);
+ SET_NETDEV_DEV(netdev, &dev->dev);
+
+ np->netdev = netdev;
+
+ netfront_carrier_off(np);
+
+ return netdev;
+
+ exit_free_tx:
+ gnttab_free_grant_references(np->gref_tx_head);
+ exit:
+ free_netdev(netdev);
+ return ERR_PTR(err);
+}
+
+#ifdef CONFIG_INET
+/*
+ * We use this notifier to send out a fake ARP reply to reset switches and
+ * router ARP caches when an IP interface is brought up on a VIF.
+ */
+static int
+inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct net_device *dev = ifa->ifa_dev->dev;
+
+ /* UP event and is it one of our devices? */
+ if (event == NETDEV_UP && dev->open == network_open)
+ send_fake_arp(dev);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block notifier_inetdev = {
+ .notifier_call = inetdev_notify,
+ .next = NULL,
+ .priority = 0
+};
+#endif
+
+
+static void netif_disconnect_backend(struct netfront_info *info)
+{
+ /* Stop old i/f to prevent errors whilst we rebuild the state. */
+ spin_lock_bh(&info->rx_lock);
+ spin_lock_irq(&info->tx_lock);
+ netfront_carrier_off(info);
+ spin_unlock_irq(&info->tx_lock);
+ spin_unlock_bh(&info->rx_lock);
+
+ if (info->irq)
+ unbind_from_irqhandler(info->irq, info->netdev);
+ info->irq = 0;
+
+ end_access(info->tx_ring_ref, info->tx.sring);
+ end_access(info->rx_ring_ref, info->rx.sring);
+ info->tx_ring_ref = GRANT_INVALID_REF;
+ info->rx_ring_ref = GRANT_INVALID_REF;
+ info->tx.sring = NULL;
+ info->rx.sring = NULL;
+}
+
+
+static void end_access(int ref, void *page)
+{
+ if (ref != GRANT_INVALID_REF)
+ gnttab_end_foreign_access(ref, (unsigned long)page);
+}
+
+
+/* ** Driver registration ** */
+
+
+static const struct xenbus_device_id netfront_ids[] = {
+ { "vif" },
+ { "" }
+};
+MODULE_ALIAS("xen:vif");
+
+
+static struct xenbus_driver netfront_driver = {
+ .name = "vif",
+ .ids = netfront_ids,
+ .probe = netfront_probe,
+ .remove = __devexit_p(netfront_remove),
+ .suspend = netfront_suspend,
+ .suspend_cancel = netfront_suspend_cancel,
+ .resume = netfront_resume,
+ .otherend_changed = backend_changed,
+};
+
+
+static int __init netif_init(void)
+{
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+#ifdef CONFIG_XEN
+ if (MODPARM_rx_flip && MODPARM_rx_copy) {
+ WPRINTK("Cannot specify both rx_copy and rx_flip.\n");
+ return -EINVAL;
+ }
+
+ if (!MODPARM_rx_flip && !MODPARM_rx_copy)
+ MODPARM_rx_flip = 1; /* Default is to flip. */
+#endif
+
+ netif_init_accel();
+
+ IPRINTK("Initialising virtual ethernet driver.\n");
+
+#ifdef CONFIG_INET
+ (void)register_inetaddr_notifier(¬ifier_inetdev);
+#endif
+
+ return xenbus_register_frontend(&netfront_driver);
+}
+module_init(netif_init);
+
+
+static void __exit netif_exit(void)
+{
+#ifdef CONFIG_INET
+ unregister_inetaddr_notifier(¬ifier_inetdev);
+#endif
+
+ netif_exit_accel();
+
+ return xenbus_unregister_driver(&netfront_driver);
+}
+module_exit(netif_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null
+/******************************************************************************
+ * Virtual network driver for conversing with remote driver backends.
+ *
+ * Copyright (c) 2002-2005, K A Fraser
+ * Copyright (c) 2005, XenSource Ltd
+ * Copyright (C) 2007 Solarflare Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef NETFRONT_H
+#define NETFRONT_H
+
+#include <xen/interface/io/netif.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+
+#define NET_TX_RING_SIZE __RING_SIZE((struct netif_tx_sring *)0, PAGE_SIZE)
+#define NET_RX_RING_SIZE __RING_SIZE((struct netif_rx_sring *)0, PAGE_SIZE)
+
+#include <xen/xenbus.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+/*
+ * Function pointer table for hooks into a network acceleration
+ * plugin. These are called at appropriate points from the netfront
+ * driver
+ */
+struct netfront_accel_hooks {
+ /*
+ * new_device: Accelerator hook to ask the plugin to support a
+ * new network interface
+ */
+ int (*new_device)(struct net_device *net_dev, struct xenbus_device *dev);
+ /*
+ * remove: Opposite of new_device
+ */
+ int (*remove)(struct xenbus_device *dev);
+ /*
+ * The net_device is being polled, check the accelerated
+ * hardware for any pending packets
+ */
+ int (*netdev_poll)(struct net_device *dev, int *pbudget);
+ /*
+ * start_xmit: Used to give the accelerated plugin the option
+ * of sending a packet. Returns non-zero if has done so, or
+ * zero to decline and force the packet onto normal send
+ * path
+ */
+ int (*start_xmit)(struct sk_buff *skb, struct net_device *dev);
+ /*
+ * start/stop_napi_interrupts Used by netfront to indicate
+ * when napi interrupts should be enabled or disabled
+ */
+ int (*start_napi_irq)(struct net_device *dev);
+ void (*stop_napi_irq)(struct net_device *dev);
+ /*
+ * Called before re-enabling the TX queue to check the fast
+ * path has slots too
+ */
+ int (*check_ready)(struct net_device *dev);
+ /*
+ * Get the fastpath network statistics
+ */
+ int (*get_stats)(struct net_device *dev,
+ struct net_device_stats *stats);
+};
+
+
+/* Version of API/protocol for communication between netfront and
+ acceleration plugin supported */
+#define NETFRONT_ACCEL_VERSION 0x00010003
+
+/*
+ * Per-netfront device state for the accelerator. This is used to
+ * allow efficient per-netfront device access to the accelerator
+ * hooks
+ */
+struct netfront_accel_vif_state {
+ struct list_head link;
+
+ struct xenbus_device *dev;
+ struct netfront_info *np;
+ struct netfront_accel_hooks *hooks;
+
+ /* Watch on the accelerator configuration value */
+ struct xenbus_watch accel_watch;
+ /* Work item to process change in accelerator */
+ struct work_struct accel_work;
+ /* The string from xenbus last time accel_watch fired */
+ char *accel_frontend;
+};
+
+/*
+ * Per-accelerator state stored in netfront. These form a list that
+ * is used to track which devices are accelerated by which plugins,
+ * and what plugins are available/have been requested
+ */
+struct netfront_accelerator {
+ /* Used to make a list */
+ struct list_head link;
+ /* ID of the accelerator */
+ int id;
+ /*
+ * String describing the accelerator. Currently this is the
+ * name of the accelerator module. This is provided by the
+ * backend accelerator through xenstore
+ */
+ char *frontend;
+ /* The hooks into the accelerator plugin module */
+ struct netfront_accel_hooks *hooks;
+
+ /*
+ * List of per-netfront device state (struct
+ * netfront_accel_vif_state) for each netfront device that is
+ * using this accelerator
+ */
+ struct list_head vif_states;
+ spinlock_t vif_states_lock;
+};
+
+struct netfront_info {
+ struct list_head list;
+ struct net_device *netdev;
+
+ struct net_device_stats stats;
+
+ struct netif_tx_front_ring tx;
+ struct netif_rx_front_ring rx;
+
+ spinlock_t tx_lock;
+ spinlock_t rx_lock;
+
+ struct napi_struct napi;
+
+ unsigned int irq;
+ unsigned int copying_receiver;
+ unsigned int carrier;
+
+ /* Receive-ring batched refills. */
+#define RX_MIN_TARGET 8
+#define RX_DFL_MIN_TARGET 64
+#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
+ unsigned rx_min_target, rx_max_target, rx_target;
+ struct sk_buff_head rx_batch;
+
+ struct timer_list rx_refill_timer;
+
+ /*
+ * {tx,rx}_skbs store outstanding skbuffs. The first entry in tx_skbs
+ * is an index into a chain of free entries.
+ */
+ struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1];
+ struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
+
+#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
+ grant_ref_t gref_tx_head;
+ grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
+ grant_ref_t gref_rx_head;
+ grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
+
+ struct xenbus_device *xbdev;
+ int tx_ring_ref;
+ int rx_ring_ref;
+ u8 mac[ETH_ALEN];
+
+ unsigned long rx_pfn_array[NET_RX_RING_SIZE];
+ struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
+ struct mmu_update rx_mmu[NET_RX_RING_SIZE];
+
+ /* Private pointer to state internal to accelerator module */
+ void *accel_priv;
+ /* The accelerator used by this netfront device */
+ struct netfront_accelerator *accelerator;
+ /* The accelerator state for this netfront device */
+ struct netfront_accel_vif_state accel_vif_state;
+};
+
+
+/* Exported Functions */
+
+/*
+ * Called by an accelerator plugin module when it has loaded.
+ *
+ * frontend: the string describing the accelerator, currently the module name
+ * hooks: the hooks for netfront to use to call into the accelerator
+ * version: the version of API between frontend and plugin requested
+ *
+ * return: 0 on success, <0 on error, >0 (with version supported) on
+ * version mismatch
+ */
+extern int netfront_accelerator_loaded(int version, const char *frontend,
+ struct netfront_accel_hooks *hooks);
+
+/*
+ * Called by an accelerator plugin module when it is about to unload.
+ *
+ * frontend: the string describing the accelerator. Must match the
+ * one passed to netfront_accelerator_loaded()
+ */
+extern void netfront_accelerator_stop(const char *frontend);
+
+/*
+ * Called by an accelerator before waking the net device's TX queue to
+ * ensure the slow path has available slots. Returns true if OK to
+ * wake, false if still busy
+ */
+extern int netfront_check_queue_ready(struct net_device *net_dev);
+
+
+/* Internal-to-netfront Functions */
+
+/*
+ * Call into accelerator and check to see if it has tx space before we
+ * wake the net device's TX queue. Returns true if OK to wake, false
+ * if still busy
+ */
+extern
+int netfront_check_accelerator_queue_ready(struct net_device *dev,
+ struct netfront_info *np);
+extern
+int netfront_accelerator_call_remove(struct netfront_info *np,
+ struct xenbus_device *dev);
+extern
+int netfront_accelerator_suspend(struct netfront_info *np,
+ struct xenbus_device *dev);
+extern
+int netfront_accelerator_suspend_cancel(struct netfront_info *np,
+ struct xenbus_device *dev);
+extern
+void netfront_accelerator_resume(struct netfront_info *np,
+ struct xenbus_device *dev);
+extern
+void netfront_accelerator_call_stop_napi_irq(struct netfront_info *np,
+ struct net_device *dev);
+extern
+int netfront_accelerator_call_get_stats(struct netfront_info *np,
+ struct net_device *dev);
+extern
+void netfront_accelerator_add_watch(struct netfront_info *np);
+
+extern
+void netif_init_accel(void);
+extern
+void netif_exit_accel(void);
+
+extern
+void init_accelerator_vif(struct netfront_info *np,
+ struct xenbus_device *dev);
+#endif /* NETFRONT_H */
--- /dev/null
+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback.o
+
+pciback-y := pci_stub.o pciback_ops.o xenbus.o
+pciback-y += conf_space.o conf_space_header.o \
+ conf_space_capability.o \
+ conf_space_capability_vpd.o \
+ conf_space_capability_pm.o \
+ conf_space_quirks.o
+pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
+pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o
+pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
+pciback-$(CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER) += controller.o
+
+ccflags-$(CONFIG_XEN_PCIDEV_BE_DEBUG) += -DDEBUG
--- /dev/null
+/*
+ * PCI Backend - Functions for creating a virtual configuration space for
+ * exported PCI Devices.
+ * It's dangerous to allow PCI Driver Domains to change their
+ * device's resources (memory, i/o ports, interrupts). We need to
+ * restrict changes to certain PCI Configuration registers:
+ * BARs, INTERRUPT_PIN, most registers in the header...
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+#define DEFINE_PCI_CONFIG(op,size,type) \
+int pciback_##op##_config_##size \
+(struct pci_dev *dev, int offset, type value, void *data) \
+{ \
+ return pci_##op##_config_##size (dev, offset, value); \
+}
+
+DEFINE_PCI_CONFIG(read, byte, u8 *)
+DEFINE_PCI_CONFIG(read, word, u16 *)
+DEFINE_PCI_CONFIG(read, dword, u32 *)
+
+DEFINE_PCI_CONFIG(write, byte, u8)
+DEFINE_PCI_CONFIG(write, word, u16)
+DEFINE_PCI_CONFIG(write, dword, u32)
+
+static int conf_space_read(struct pci_dev *dev,
+ struct config_field_entry *entry, int offset,
+ u32 * value)
+{
+ int ret = 0;
+ struct config_field *field = entry->field;
+
+ *value = 0;
+
+ switch (field->size) {
+ case 1:
+ if (field->u.b.read)
+ ret = field->u.b.read(dev, offset, (u8 *) value,
+ entry->data);
+ break;
+ case 2:
+ if (field->u.w.read)
+ ret = field->u.w.read(dev, offset, (u16 *) value,
+ entry->data);
+ break;
+ case 4:
+ if (field->u.dw.read)
+ ret = field->u.dw.read(dev, offset, value, entry->data);
+ break;
+ }
+ return ret;
+}
+
+static int conf_space_write(struct pci_dev *dev,
+ struct config_field_entry *entry, int offset,
+ u32 value)
+{
+ int ret = 0;
+ struct config_field *field = entry->field;
+
+ switch (field->size) {
+ case 1:
+ if (field->u.b.write)
+ ret = field->u.b.write(dev, offset, (u8) value,
+ entry->data);
+ break;
+ case 2:
+ if (field->u.w.write)
+ ret = field->u.w.write(dev, offset, (u16) value,
+ entry->data);
+ break;
+ case 4:
+ if (field->u.dw.write)
+ ret = field->u.dw.write(dev, offset, value,
+ entry->data);
+ break;
+ }
+ return ret;
+}
+
+static inline u32 get_mask(int size)
+{
+ if (size == 1)
+ return 0xff;
+ else if (size == 2)
+ return 0xffff;
+ else
+ return 0xffffffff;
+}
+
+static inline int valid_request(int offset, int size)
+{
+ /* Validate request (no un-aligned requests) */
+ if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
+ return 1;
+ return 0;
+}
+
+static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
+ int offset)
+{
+ if (offset >= 0) {
+ new_val_mask <<= (offset * 8);
+ new_val <<= (offset * 8);
+ } else {
+ new_val_mask >>= (offset * -8);
+ new_val >>= (offset * -8);
+ }
+ val = (val & ~new_val_mask) | (new_val & new_val_mask);
+
+ return val;
+}
+
+static int pcibios_err_to_errno(int err)
+{
+ switch (err) {
+ case PCIBIOS_SUCCESSFUL:
+ return XEN_PCI_ERR_success;
+ case PCIBIOS_DEVICE_NOT_FOUND:
+ return XEN_PCI_ERR_dev_not_found;
+ case PCIBIOS_BAD_REGISTER_NUMBER:
+ return XEN_PCI_ERR_invalid_offset;
+ case PCIBIOS_FUNC_NOT_SUPPORTED:
+ return XEN_PCI_ERR_not_implemented;
+ case PCIBIOS_SET_FAILED:
+ return XEN_PCI_ERR_access_denied;
+ }
+ return err;
+}
+
+int pciback_config_read(struct pci_dev *dev, int offset, int size,
+ u32 * ret_val)
+{
+ int err = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry;
+ struct config_field *field;
+ int req_start, req_end, field_start, field_end;
+ /* if read fails for any reason, return 0 (as if device didn't respond) */
+ u32 value = 0, tmp_val;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
+ pci_name(dev), size, offset);
+
+ if (!valid_request(offset, size)) {
+ err = XEN_PCI_ERR_invalid_offset;
+ goto out;
+ }
+
+ /* Get the real value first, then modify as appropriate */
+ switch (size) {
+ case 1:
+ err = pci_read_config_byte(dev, offset, (u8 *) & value);
+ break;
+ case 2:
+ err = pci_read_config_word(dev, offset, (u16 *) & value);
+ break;
+ case 4:
+ err = pci_read_config_dword(dev, offset, &value);
+ break;
+ }
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ req_start = offset;
+ req_end = offset + size;
+ field_start = OFFSET(cfg_entry);
+ field_end = OFFSET(cfg_entry) + field->size;
+
+ if ((req_start >= field_start && req_start < field_end)
+ || (req_end > field_start && req_end <= field_end)) {
+ err = conf_space_read(dev, cfg_entry, field_start,
+ &tmp_val);
+ if (err)
+ goto out;
+
+ value = merge_value(value, tmp_val,
+ get_mask(field->size),
+ field_start - req_start);
+ }
+ }
+
+ out:
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
+ pci_name(dev), size, offset, value);
+
+ *ret_val = value;
+ return pcibios_err_to_errno(err);
+}
+
+int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
+{
+ int err = 0, handled = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry;
+ struct config_field *field;
+ u32 tmp_val;
+ int req_start, req_end, field_start, field_end;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG
+ "pciback: %s: write request %d bytes at 0x%x = %x\n",
+ pci_name(dev), size, offset, value);
+
+ if (!valid_request(offset, size))
+ return XEN_PCI_ERR_invalid_offset;
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ req_start = offset;
+ req_end = offset + size;
+ field_start = OFFSET(cfg_entry);
+ field_end = OFFSET(cfg_entry) + field->size;
+
+ if ((req_start >= field_start && req_start < field_end)
+ || (req_end > field_start && req_end <= field_end)) {
+ tmp_val = 0;
+
+ err = pciback_config_read(dev, field_start,
+ field->size, &tmp_val);
+ if (err)
+ break;
+
+ tmp_val = merge_value(tmp_val, value, get_mask(size),
+ req_start - field_start);
+
+ err = conf_space_write(dev, cfg_entry, field_start,
+ tmp_val);
+
+ /* handled is set true here, but not every byte
+ * may have been written! Properly detecting if
+ * every byte is handled is unnecessary as the
+ * flag is used to detect devices that need
+ * special helpers to work correctly.
+ */
+ handled = 1;
+ }
+ }
+
+ if (!handled && !err) {
+ /* By default, anything not specificially handled above is
+ * read-only. The permissive flag changes this behavior so
+ * that anything not specifically handled above is writable.
+ * This means that some fields may still be read-only because
+ * they have entries in the config_field list that intercept
+ * the write and do nothing. */
+ if (dev_data->permissive) {
+ switch (size) {
+ case 1:
+ err = pci_write_config_byte(dev, offset,
+ (u8) value);
+ break;
+ case 2:
+ err = pci_write_config_word(dev, offset,
+ (u16) value);
+ break;
+ case 4:
+ err = pci_write_config_dword(dev, offset,
+ (u32) value);
+ break;
+ }
+ } else if (!dev_data->warned_on_write) {
+ dev_data->warned_on_write = 1;
+ dev_warn(&dev->dev, "Driver tried to write to a "
+ "read-only configuration space field at offset "
+ "0x%x, size %d. This may be harmless, but if "
+ "you have problems with your device:\n"
+ "1) see permissive attribute in sysfs\n"
+ "2) report problems to the xen-devel "
+ "mailing list along with details of your "
+ "device obtained from lspci.\n", offset, size);
+ }
+ }
+
+ return pcibios_err_to_errno(err);
+}
+
+void pciback_config_free_dyn_fields(struct pci_dev *dev)
+{
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry, *t;
+ struct config_field *field;
+
+ dev_dbg(&dev->dev,
+ "free-ing dynamically allocated virtual configuration space fields\n");
+
+ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ if (field->clean) {
+ field->clean(field);
+
+ if (cfg_entry->data)
+ kfree(cfg_entry->data);
+
+ list_del(&cfg_entry->list);
+ kfree(cfg_entry);
+ }
+
+ }
+}
+
+void pciback_config_reset_dev(struct pci_dev *dev)
+{
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry;
+ struct config_field *field;
+
+ dev_dbg(&dev->dev, "resetting virtual configuration space\n");
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ if (field->reset)
+ field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
+ }
+}
+
+void pciback_config_free_dev(struct pci_dev *dev)
+{
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry, *t;
+ struct config_field *field;
+
+ dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n");
+
+ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+ list_del(&cfg_entry->list);
+
+ field = cfg_entry->field;
+
+ if (field->release)
+ field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
+
+ kfree(cfg_entry);
+ }
+}
+
+int pciback_config_add_field_offset(struct pci_dev *dev,
+ struct config_field *field,
+ unsigned int base_offset)
+{
+ int err = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry;
+ void *tmp;
+
+ cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
+ if (!cfg_entry) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ cfg_entry->data = NULL;
+ cfg_entry->field = field;
+ cfg_entry->base_offset = base_offset;
+
+ /* silently ignore duplicate fields */
+ err = pciback_field_is_dup(dev,OFFSET(cfg_entry));
+ if (err)
+ goto out;
+
+ if (field->init) {
+ tmp = field->init(dev, OFFSET(cfg_entry));
+
+ if (IS_ERR(tmp)) {
+ err = PTR_ERR(tmp);
+ goto out;
+ }
+
+ cfg_entry->data = tmp;
+ }
+
+ dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
+ OFFSET(cfg_entry));
+ list_add_tail(&cfg_entry->list, &dev_data->config_fields);
+
+ out:
+ if (err)
+ kfree(cfg_entry);
+
+ return err;
+}
+
+/* This sets up the device's virtual configuration space to keep track of
+ * certain registers (like the base address registers (BARs) so that we can
+ * keep the client from manipulating them directly.
+ */
+int pciback_config_init_dev(struct pci_dev *dev)
+{
+ int err = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+
+ dev_dbg(&dev->dev, "initializing virtual configuration space\n");
+
+ INIT_LIST_HEAD(&dev_data->config_fields);
+
+ err = pciback_config_header_add_fields(dev);
+ if (err)
+ goto out;
+
+ err = pciback_config_capability_add_fields(dev);
+ if (err)
+ goto out;
+
+ err = pciback_config_quirks_init(dev);
+
+ out:
+ return err;
+}
+
+int pciback_config_init(void)
+{
+ return pciback_config_capability_init();
+}
--- /dev/null
+/*
+ * PCI Backend - Common data structures for overriding the configuration space
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#ifndef __XEN_PCIBACK_CONF_SPACE_H__
+#define __XEN_PCIBACK_CONF_SPACE_H__
+
+#include <linux/list.h>
+#include <linux/err.h>
+
+/* conf_field_init can return an errno in a ptr with ERR_PTR() */
+typedef void *(*conf_field_init) (struct pci_dev * dev, int offset);
+typedef void (*conf_field_reset) (struct pci_dev * dev, int offset, void *data);
+typedef void (*conf_field_free) (struct pci_dev * dev, int offset, void *data);
+
+typedef int (*conf_dword_write) (struct pci_dev * dev, int offset, u32 value,
+ void *data);
+typedef int (*conf_word_write) (struct pci_dev * dev, int offset, u16 value,
+ void *data);
+typedef int (*conf_byte_write) (struct pci_dev * dev, int offset, u8 value,
+ void *data);
+typedef int (*conf_dword_read) (struct pci_dev * dev, int offset, u32 * value,
+ void *data);
+typedef int (*conf_word_read) (struct pci_dev * dev, int offset, u16 * value,
+ void *data);
+typedef int (*conf_byte_read) (struct pci_dev * dev, int offset, u8 * value,
+ void *data);
+
+/* These are the fields within the configuration space which we
+ * are interested in intercepting reads/writes to and changing their
+ * values.
+ */
+struct config_field {
+ unsigned int offset;
+ unsigned int size;
+ unsigned int mask;
+ conf_field_init init;
+ conf_field_reset reset;
+ conf_field_free release;
+ void (*clean) (struct config_field * field);
+ union {
+ struct {
+ conf_dword_write write;
+ conf_dword_read read;
+ } dw;
+ struct {
+ conf_word_write write;
+ conf_word_read read;
+ } w;
+ struct {
+ conf_byte_write write;
+ conf_byte_read read;
+ } b;
+ } u;
+ struct list_head list;
+};
+
+struct config_field_entry {
+ struct list_head list;
+ struct config_field *field;
+ unsigned int base_offset;
+ void *data;
+};
+
+#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
+
+/* Add fields to a device - the add_fields macro expects to get a pointer to
+ * the first entry in an array (of which the ending is marked by size==0)
+ */
+int pciback_config_add_field_offset(struct pci_dev *dev,
+ struct config_field *field,
+ unsigned int offset);
+
+static inline int pciback_config_add_field(struct pci_dev *dev,
+ struct config_field *field)
+{
+ return pciback_config_add_field_offset(dev, field, 0);
+}
+
+static inline int pciback_config_add_fields(struct pci_dev *dev,
+ struct config_field *field)
+{
+ int i, err = 0;
+ for (i = 0; field[i].size != 0; i++) {
+ err = pciback_config_add_field(dev, &field[i]);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+static inline int pciback_config_add_fields_offset(struct pci_dev *dev,
+ struct config_field *field,
+ unsigned int offset)
+{
+ int i, err = 0;
+ for (i = 0; field[i].size != 0; i++) {
+ err = pciback_config_add_field_offset(dev, &field[i], offset);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+/* Read/Write the real configuration space */
+int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 * value,
+ void *data);
+int pciback_read_config_word(struct pci_dev *dev, int offset, u16 * value,
+ void *data);
+int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 * value,
+ void *data);
+int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
+ void *data);
+int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
+ void *data);
+int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
+ void *data);
+
+int pciback_config_capability_init(void);
+
+int pciback_config_header_add_fields(struct pci_dev *dev);
+int pciback_config_capability_add_fields(struct pci_dev *dev);
+
+#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */
--- /dev/null
+/*
+ * PCI Backend - Handles the virtual fields found on the capability lists
+ * in the configuration space.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_capability.h"
+
+static LIST_HEAD(capabilities);
+
+static struct config_field caplist_header[] = {
+ {
+ .offset = PCI_CAP_LIST_ID,
+ .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
+ .u.w.read = pciback_read_config_word,
+ .u.w.write = NULL,
+ },
+ {
+ .size = 0,
+ },
+};
+
+static inline void register_capability(struct pciback_config_capability *cap)
+{
+ list_add_tail(&cap->cap_list, &capabilities);
+}
+
+int pciback_config_capability_add_fields(struct pci_dev *dev)
+{
+ int err = 0;
+ struct pciback_config_capability *cap;
+ int cap_offset;
+
+ list_for_each_entry(cap, &capabilities, cap_list) {
+ cap_offset = pci_find_capability(dev, cap->capability);
+ if (cap_offset) {
+ dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
+ cap->capability, cap_offset);
+
+ err = pciback_config_add_fields_offset(dev,
+ caplist_header,
+ cap_offset);
+ if (err)
+ goto out;
+ err = pciback_config_add_fields_offset(dev,
+ cap->fields,
+ cap_offset);
+ if (err)
+ goto out;
+ }
+ }
+
+ out:
+ return err;
+}
+
+extern struct pciback_config_capability pciback_config_capability_vpd;
+extern struct pciback_config_capability pciback_config_capability_pm;
+
+int pciback_config_capability_init(void)
+{
+ register_capability(&pciback_config_capability_vpd);
+ register_capability(&pciback_config_capability_pm);
+
+ return 0;
+}
--- /dev/null
+/*
+ * PCI Backend - Data structures for special overlays for structures on
+ * the capability list.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
+#define __PCIBACK_CONFIG_CAPABILITY_H__
+
+#include <linux/pci.h>
+#include <linux/list.h>
+
+struct pciback_config_capability {
+ struct list_head cap_list;
+
+ int capability;
+
+ /* If the device has the capability found above, add these fields */
+ struct config_field *fields;
+};
+
+#endif
--- /dev/null
+/*
+ * PCI Backend - Configuration space overlay for power management
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/pci.h>
+#include "conf_space.h"
+#include "conf_space_capability.h"
+
+static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
+ void *data)
+{
+ int err;
+ u16 real_value;
+
+ err = pci_read_config_word(dev, offset, &real_value);
+ if (err)
+ goto out;
+
+ *value = real_value & ~PCI_PM_CAP_PME_MASK;
+
+ out:
+ return err;
+}
+
+/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
+ * Can't allow driver domain to enable PMEs - they're shared */
+#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
+
+static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
+ void *data)
+{
+ int err;
+ u16 old_value;
+ pci_power_t new_state, old_state;
+
+ err = pci_read_config_word(dev, offset, &old_value);
+ if (err)
+ goto out;
+
+ old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK);
+ new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
+
+ new_value &= PM_OK_BITS;
+ if ((old_value & PM_OK_BITS) != new_value) {
+ new_value = (old_value & ~PM_OK_BITS) | new_value;
+ err = pci_write_config_word(dev, offset, new_value);
+ if (err)
+ goto out;
+ }
+
+ /* Let pci core handle the power management change */
+ dev_dbg(&dev->dev, "set power state to %x\n", new_state);
+ err = pci_set_power_state(dev, new_state);
+ if (err) {
+ err = PCIBIOS_SET_FAILED;
+ goto out;
+ }
+
+ /*
+ * Device may lose PCI config info on D3->D0 transition. This
+ * is a problem for some guests which will not reset BARs. Even
+ * those that have a go will be foiled by our BAR-write handler
+ * which will discard the write! Since Linux won't re-init
+ * the config space automatically in all cases, we do it here.
+ * Future: Should we re-initialise all first 64 bytes of config space?
+ */
+ if (new_state == PCI_D0 &&
+ (old_state == PCI_D3hot || old_state == PCI_D3cold) &&
+ !(old_value & PCI_PM_CTRL_NO_SOFT_RESET))
+ pci_restore_bars(dev);
+
+ out:
+ return err;
+}
+
+/* Ensure PMEs are disabled */
+static void *pm_ctrl_init(struct pci_dev *dev, int offset)
+{
+ int err;
+ u16 value;
+
+ err = pci_read_config_word(dev, offset, &value);
+ if (err)
+ goto out;
+
+ if (value & PCI_PM_CTRL_PME_ENABLE) {
+ value &= ~PCI_PM_CTRL_PME_ENABLE;
+ err = pci_write_config_word(dev, offset, value);
+ }
+
+ out:
+ return ERR_PTR(err);
+}
+
+static struct config_field caplist_pm[] = {
+ {
+ .offset = PCI_PM_PMC,
+ .size = 2,
+ .u.w.read = pm_caps_read,
+ },
+ {
+ .offset = PCI_PM_CTRL,
+ .size = 2,
+ .init = pm_ctrl_init,
+ .u.w.read = pciback_read_config_word,
+ .u.w.write = pm_ctrl_write,
+ },
+ {
+ .offset = PCI_PM_PPB_EXTENSIONS,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ },
+ {
+ .offset = PCI_PM_DATA_REGISTER,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ },
+ {
+ .size = 0,
+ },
+};
+
+struct pciback_config_capability pciback_config_capability_pm = {
+ .capability = PCI_CAP_ID_PM,
+ .fields = caplist_pm,
+};
--- /dev/null
+/*
+ * PCI Backend - Configuration space overlay for Vital Product Data
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/pci.h>
+#include "conf_space.h"
+#include "conf_space_capability.h"
+
+static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
+ void *data)
+{
+ /* Disallow writes to the vital product data */
+ if (value & PCI_VPD_ADDR_F)
+ return PCIBIOS_SET_FAILED;
+ else
+ return pci_write_config_word(dev, offset, value);
+}
+
+static struct config_field caplist_vpd[] = {
+ {
+ .offset = PCI_VPD_ADDR,
+ .size = 2,
+ .u.w.read = pciback_read_config_word,
+ .u.w.write = vpd_address_write,
+ },
+ {
+ .offset = PCI_VPD_DATA,
+ .size = 4,
+ .u.dw.read = pciback_read_config_dword,
+ .u.dw.write = NULL,
+ },
+ {
+ .size = 0,
+ },
+};
+
+struct pciback_config_capability pciback_config_capability_vpd = {
+ .capability = PCI_CAP_ID_VPD,
+ .fields = caplist_vpd,
+};
--- /dev/null
+/*
+ * PCI Backend - Handles the virtual fields in the configuration space headers.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+
+struct pci_bar_info {
+ u32 val;
+ u32 len_val;
+ int which;
+};
+
+#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
+#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
+
+static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
+{
+ int err;
+
+ if (!atomic_read(&dev->enable_cnt) && is_enable_cmd(value)) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: enable\n",
+ pci_name(dev));
+ err = pci_enable_device(dev);
+ if (err)
+ return err;
+ } else if (atomic_read(&dev->enable_cnt) && !is_enable_cmd(value)) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: disable\n",
+ pci_name(dev));
+ pci_disable_device(dev);
+ }
+
+ if (!dev->is_busmaster && is_master_cmd(value)) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: set bus master\n",
+ pci_name(dev));
+ pci_set_master(dev);
+ }
+
+ if (value & PCI_COMMAND_INVALIDATE) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG
+ "pciback: %s: enable memory-write-invalidate\n",
+ pci_name(dev));
+ err = pci_set_mwi(dev);
+ if (err) {
+ printk(KERN_WARNING
+ "pciback: %s: cannot enable memory-write-invalidate (%d)\n",
+ pci_name(dev), err);
+ value &= ~PCI_COMMAND_INVALIDATE;
+ }
+ }
+
+ return pci_write_config_word(dev, offset, value);
+}
+
+static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ if (unlikely(!bar)) {
+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
+ pci_name(dev));
+ return XEN_PCI_ERR_op_failed;
+ }
+
+ /* A write to obtain the length must happen as a 32-bit write.
+ * This does not (yet) support writing individual bytes
+ */
+ if (value == ~PCI_ROM_ADDRESS_ENABLE)
+ bar->which = 1;
+ else {
+ u32 tmpval;
+ pci_read_config_dword(dev, offset, &tmpval);
+ if (tmpval != bar->val && value == bar->val) {
+ /* Allow restoration of bar value. */
+ pci_write_config_dword(dev, offset, bar->val);
+ }
+ bar->which = 0;
+ }
+
+ /* Do we need to support enabling/disabling the rom address here? */
+
+ return 0;
+}
+
+/* For the BARs, only allow writes which write ~0 or
+ * the correct resource information
+ * (Needed for when the driver probes the resource usage)
+ */
+static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ if (unlikely(!bar)) {
+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
+ pci_name(dev));
+ return XEN_PCI_ERR_op_failed;
+ }
+
+ /* A write to obtain the length must happen as a 32-bit write.
+ * This does not (yet) support writing individual bytes
+ */
+ if (value == ~0)
+ bar->which = 1;
+ else {
+ u32 tmpval;
+ pci_read_config_dword(dev, offset, &tmpval);
+ if (tmpval != bar->val && value == bar->val) {
+ /* Allow restoration of bar value. */
+ pci_write_config_dword(dev, offset, bar->val);
+ }
+ bar->which = 0;
+ }
+
+ return 0;
+}
+
+static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ if (unlikely(!bar)) {
+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
+ pci_name(dev));
+ return XEN_PCI_ERR_op_failed;
+ }
+
+ *value = bar->which ? bar->len_val : bar->val;
+
+ return 0;
+}
+
+static inline void read_dev_bar(struct pci_dev *dev,
+ struct pci_bar_info *bar_info, int offset,
+ u32 len_mask)
+{
+ pci_read_config_dword(dev, offset, &bar_info->val);
+ pci_write_config_dword(dev, offset, len_mask);
+ pci_read_config_dword(dev, offset, &bar_info->len_val);
+ pci_write_config_dword(dev, offset, bar_info->val);
+}
+
+static void *bar_init(struct pci_dev *dev, int offset)
+{
+ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
+ if (!bar)
+ return ERR_PTR(-ENOMEM);
+
+ read_dev_bar(dev, bar, offset, ~0);
+ bar->which = 0;
+
+ return bar;
+}
+
+static void *rom_init(struct pci_dev *dev, int offset)
+{
+ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
+ if (!bar)
+ return ERR_PTR(-ENOMEM);
+
+ read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
+ bar->which = 0;
+
+ return bar;
+}
+
+static void bar_reset(struct pci_dev *dev, int offset, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ bar->which = 0;
+}
+
+static void bar_release(struct pci_dev *dev, int offset, void *data)
+{
+ kfree(data);
+}
+
+static int interrupt_read(struct pci_dev *dev, int offset, u8 * value,
+ void *data)
+{
+ *value = (u8) dev->irq;
+
+ return 0;
+}
+
+static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
+{
+ u8 cur_value;
+ int err;
+
+ err = pci_read_config_byte(dev, offset, &cur_value);
+ if (err)
+ goto out;
+
+ if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START)
+ || value == PCI_BIST_START)
+ err = pci_write_config_byte(dev, offset, value);
+
+ out:
+ return err;
+}
+
+static struct config_field header_common[] = {
+ {
+ .offset = PCI_COMMAND,
+ .size = 2,
+ .u.w.read = pciback_read_config_word,
+ .u.w.write = command_write,
+ },
+ {
+ .offset = PCI_INTERRUPT_LINE,
+ .size = 1,
+ .u.b.read = interrupt_read,
+ },
+ {
+ .offset = PCI_INTERRUPT_PIN,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ },
+ {
+ /* Any side effects of letting driver domain control cache line? */
+ .offset = PCI_CACHE_LINE_SIZE,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ .u.b.write = pciback_write_config_byte,
+ },
+ {
+ .offset = PCI_LATENCY_TIMER,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ },
+ {
+ .offset = PCI_BIST,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ .u.b.write = bist_write,
+ },
+ {
+ .size = 0,
+ },
+};
+
+#define CFG_FIELD_BAR(reg_offset) \
+ { \
+ .offset = reg_offset, \
+ .size = 4, \
+ .init = bar_init, \
+ .reset = bar_reset, \
+ .release = bar_release, \
+ .u.dw.read = bar_read, \
+ .u.dw.write = bar_write, \
+ }
+
+#define CFG_FIELD_ROM(reg_offset) \
+ { \
+ .offset = reg_offset, \
+ .size = 4, \
+ .init = rom_init, \
+ .reset = bar_reset, \
+ .release = bar_release, \
+ .u.dw.read = bar_read, \
+ .u.dw.write = rom_write, \
+ }
+
+static struct config_field header_0[] = {
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
+ CFG_FIELD_ROM(PCI_ROM_ADDRESS),
+ {
+ .size = 0,
+ },
+};
+
+static struct config_field header_1[] = {
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
+ CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
+ {
+ .size = 0,
+ },
+};
+
+int pciback_config_header_add_fields(struct pci_dev *dev)
+{
+ int err;
+
+ err = pciback_config_add_fields(dev, header_common);
+ if (err)
+ goto out;
+
+ switch (dev->hdr_type) {
+ case PCI_HEADER_TYPE_NORMAL:
+ err = pciback_config_add_fields(dev, header_0);
+ break;
+
+ case PCI_HEADER_TYPE_BRIDGE:
+ err = pciback_config_add_fields(dev, header_1);
+ break;
+
+ default:
+ err = -EINVAL;
+ printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n",
+ pci_name(dev), dev->hdr_type);
+ break;
+ }
+
+ out:
+ return err;
+}
--- /dev/null
+/*
+ * PCI Backend - Handle special overlays for broken devices.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Author: Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+LIST_HEAD(pciback_quirks);
+
+struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev)
+{
+ struct pciback_config_quirk *tmp_quirk;
+
+ list_for_each_entry(tmp_quirk, &pciback_quirks, quirks_list)
+ if (pci_match_id(&tmp_quirk->devid, dev))
+ goto out;
+ tmp_quirk = NULL;
+ printk(KERN_DEBUG
+ "quirk didn't match any device pciback knows about\n");
+ out:
+ return tmp_quirk;
+}
+
+static inline void register_quirk(struct pciback_config_quirk *quirk)
+{
+ list_add_tail(&quirk->quirks_list, &pciback_quirks);
+}
+
+int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg)
+{
+ int ret = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry;
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ if ( OFFSET(cfg_entry) == reg) {
+ ret = 1;
+ break;
+ }
+ }
+ return ret;
+}
+
+int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
+ *field)
+{
+ int err = 0;
+
+ switch (field->size) {
+ case 1:
+ field->u.b.read = pciback_read_config_byte;
+ field->u.b.write = pciback_write_config_byte;
+ break;
+ case 2:
+ field->u.w.read = pciback_read_config_word;
+ field->u.w.write = pciback_write_config_word;
+ break;
+ case 4:
+ field->u.dw.read = pciback_read_config_dword;
+ field->u.dw.write = pciback_write_config_dword;
+ break;
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+
+ pciback_config_add_field(dev, field);
+
+ out:
+ return err;
+}
+
+int pciback_config_quirks_init(struct pci_dev *dev)
+{
+ struct pciback_config_quirk *quirk;
+ int ret = 0;
+
+ quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
+ if (!quirk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ quirk->devid.vendor = dev->vendor;
+ quirk->devid.device = dev->device;
+ quirk->devid.subvendor = dev->subsystem_vendor;
+ quirk->devid.subdevice = dev->subsystem_device;
+ quirk->devid.class = 0;
+ quirk->devid.class_mask = 0;
+ quirk->devid.driver_data = 0UL;
+
+ quirk->pdev = dev;
+
+ register_quirk(quirk);
+ out:
+ return ret;
+}
+
+void pciback_config_field_free(struct config_field *field)
+{
+ kfree(field);
+}
+
+int pciback_config_quirk_release(struct pci_dev *dev)
+{
+ struct pciback_config_quirk *quirk;
+ int ret = 0;
+
+ quirk = pciback_find_quirk(dev);
+ if (!quirk) {
+ ret = -ENXIO;
+ goto out;
+ }
+
+ list_del(&quirk->quirks_list);
+ kfree(quirk);
+
+ out:
+ return ret;
+}
--- /dev/null
+/*
+ * PCI Backend - Data structures for special overlays for broken devices.
+ *
+ * Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+
+#include <linux/pci.h>
+#include <linux/list.h>
+
+struct pciback_config_quirk {
+ struct list_head quirks_list;
+ struct pci_device_id devid;
+ struct pci_dev *pdev;
+};
+
+struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev);
+
+int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
+ *field);
+
+int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg);
+
+int pciback_config_quirks_init(struct pci_dev *dev);
+
+void pciback_config_field_free(struct config_field *field);
+
+int pciback_config_quirk_release(struct pci_dev *dev);
+
+int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg);
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
+ * Alex Williamson <alex.williamson@hp.com>
+ *
+ * PCI "Controller" Backend - virtualize PCI bus topology based on PCI
+ * controllers. Devices under the same PCI controller are exposed on the
+ * same virtual domain:bus. Within a bus, device slots are virtualized
+ * to compact the bus.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/acpi.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+#define PCI_MAX_BUSSES 255
+#define PCI_MAX_SLOTS 32
+
+struct controller_dev_entry {
+ struct list_head list;
+ struct pci_dev *dev;
+ unsigned int devfn;
+};
+
+struct controller_list_entry {
+ struct list_head list;
+ struct pci_controller *controller;
+ unsigned int domain;
+ unsigned int bus;
+ unsigned int next_devfn;
+ struct list_head dev_list;
+};
+
+struct controller_dev_data {
+ struct list_head list;
+ unsigned int next_domain;
+ unsigned int next_bus;
+ spinlock_t lock;
+};
+
+struct walk_info {
+ struct pciback_device *pdev;
+ int resource_count;
+ int root_num;
+};
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_dev_entry *dev_entry;
+ struct controller_list_entry *cntrl_entry;
+ struct pci_dev *dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+ if (cntrl_entry->domain != domain ||
+ cntrl_entry->bus != bus)
+ continue;
+
+ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
+ if (devfn == dev_entry->devfn) {
+ dev = dev_entry->dev;
+ goto found;
+ }
+ }
+ }
+found:
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ return dev;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_dev_entry *dev_entry;
+ struct controller_list_entry *cntrl_entry;
+ struct pci_controller *dev_controller = PCI_CONTROLLER(dev);
+ unsigned long flags;
+ int ret = 0, found = 0;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ /* Look to see if we already have a domain:bus for this controller */
+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+ if (cntrl_entry->controller == dev_controller) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ cntrl_entry = kmalloc(sizeof(*cntrl_entry), GFP_ATOMIC);
+ if (!cntrl_entry) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ cntrl_entry->controller = dev_controller;
+ cntrl_entry->next_devfn = PCI_DEVFN(0, 0);
+
+ cntrl_entry->domain = dev_data->next_domain;
+ cntrl_entry->bus = dev_data->next_bus++;
+ if (dev_data->next_bus > PCI_MAX_BUSSES) {
+ dev_data->next_domain++;
+ dev_data->next_bus = 0;
+ }
+
+ INIT_LIST_HEAD(&cntrl_entry->dev_list);
+
+ list_add_tail(&cntrl_entry->list, &dev_data->list);
+ }
+
+ if (PCI_SLOT(cntrl_entry->next_devfn) > PCI_MAX_SLOTS) {
+ /*
+ * While it seems unlikely, this can actually happen if
+ * a controller has P2P bridges under it.
+ */
+ xenbus_dev_fatal(pdev->xdev, -ENOSPC, "Virtual bus %04x:%02x "
+ "is full, no room to export %04x:%02x:%02x.%x",
+ cntrl_entry->domain, cntrl_entry->bus,
+ pci_domain_nr(dev->bus), dev->bus->number,
+ PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_ATOMIC);
+ if (!dev_entry) {
+ if (list_empty(&cntrl_entry->dev_list)) {
+ list_del(&cntrl_entry->list);
+ kfree(cntrl_entry);
+ }
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ dev_entry->dev = dev;
+ dev_entry->devfn = cntrl_entry->next_devfn;
+
+ list_add_tail(&dev_entry->list, &cntrl_entry->dev_list);
+
+ cntrl_entry->next_devfn += PCI_DEVFN(1, 0);
+
+out:
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ /* TODO: Publish virtual domain:bus:slot.func here. */
+
+ return ret;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_list_entry *cntrl_entry;
+ struct controller_dev_entry *dev_entry = NULL;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+ if (cntrl_entry->controller != PCI_CONTROLLER(dev))
+ continue;
+
+ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
+ if (dev_entry->dev == dev) {
+ found_dev = dev_entry->dev;
+ break;
+ }
+ }
+ }
+
+ if (!found_dev) {
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+ return;
+ }
+
+ list_del(&dev_entry->list);
+ kfree(dev_entry);
+
+ if (list_empty(&cntrl_entry->dev_list)) {
+ list_del(&cntrl_entry->list);
+ kfree(cntrl_entry);
+ }
+
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+ pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+ struct controller_dev_data *dev_data;
+
+ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
+ if (!dev_data)
+ return -ENOMEM;
+
+ spin_lock_init(&dev_data->lock);
+
+ INIT_LIST_HEAD(&dev_data->list);
+
+ /* Starting domain:bus numbers */
+ dev_data->next_domain = 0;
+ dev_data->next_bus = 0;
+
+ pdev->pci_dev_data = dev_data;
+
+ return 0;
+}
+
+static acpi_status write_xenbus_resource(struct acpi_resource *res, void *data)
+{
+ struct walk_info *info = data;
+ struct acpi_resource_address64 addr;
+ acpi_status status;
+ int i, len, err;
+ char str[32], tmp[3];
+ unsigned char *ptr, *buf;
+
+ status = acpi_resource_to_address64(res, &addr);
+
+ /* Do we care about this range? Let's check. */
+ if (!ACPI_SUCCESS(status) ||
+ !(addr.resource_type == ACPI_MEMORY_RANGE ||
+ addr.resource_type == ACPI_IO_RANGE) ||
+ !addr.address_length || addr.producer_consumer != ACPI_PRODUCER)
+ return AE_OK;
+
+ /*
+ * Furthermore, we really only care to tell the guest about
+ * address ranges that require address translation of some sort.
+ */
+ if (!(addr.resource_type == ACPI_MEMORY_RANGE &&
+ addr.info.mem.translation) &&
+ !(addr.resource_type == ACPI_IO_RANGE &&
+ addr.info.io.translation))
+ return AE_OK;
+
+ /* Store the resource in xenbus for the guest */
+ len = snprintf(str, sizeof(str), "root-%d-resource-%d",
+ info->root_num, info->resource_count);
+ if (unlikely(len >= (sizeof(str) - 1)))
+ return AE_OK;
+
+ buf = kzalloc((sizeof(*res) * 2) + 1, GFP_KERNEL);
+ if (!buf)
+ return AE_OK;
+
+ /* Clean out resource_source */
+ res->data.address64.resource_source.index = 0xFF;
+ res->data.address64.resource_source.string_length = 0;
+ res->data.address64.resource_source.string_ptr = NULL;
+
+ ptr = (unsigned char *)res;
+
+ /* Turn the acpi_resource into an ASCII byte stream */
+ for (i = 0; i < sizeof(*res); i++) {
+ snprintf(tmp, sizeof(tmp), "%02x", ptr[i]);
+ strncat(buf, tmp, 2);
+ }
+
+ err = xenbus_printf(XBT_NIL, info->pdev->xdev->nodename,
+ str, "%s", buf);
+
+ if (!err)
+ info->resource_count++;
+
+ kfree(buf);
+
+ return AE_OK;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb publish_root_cb)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_list_entry *cntrl_entry;
+ int i, root_num, len, err = 0;
+ unsigned int domain, bus;
+ char str[64];
+ struct walk_info info;
+
+ spin_lock(&dev_data->lock);
+
+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+ /* First publish all the domain:bus info */
+ err = publish_root_cb(pdev, cntrl_entry->domain,
+ cntrl_entry->bus);
+ if (err)
+ goto out;
+
+ /*
+ * Now figure out which root-%d this belongs to
+ * so we can associate resources with it.
+ */
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ "root_num", "%d", &root_num);
+
+ if (err != 1)
+ goto out;
+
+ for (i = 0; i < root_num; i++) {
+ len = snprintf(str, sizeof(str), "root-%d", i);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ str, "%x:%x", &domain, &bus);
+ if (err != 2)
+ goto out;
+
+ /* Is this the one we just published? */
+ if (domain == cntrl_entry->domain &&
+ bus == cntrl_entry->bus)
+ break;
+ }
+
+ if (i == root_num)
+ goto out;
+
+ info.pdev = pdev;
+ info.resource_count = 0;
+ info.root_num = i;
+
+ /* Let ACPI do the heavy lifting on decoding resources */
+ acpi_walk_resources(cntrl_entry->controller->acpi_handle,
+ METHOD_NAME__CRS, write_xenbus_resource,
+ &info);
+
+ /* No resouces. OK. On to the next one */
+ if (!info.resource_count)
+ continue;
+
+ /* Store the number of resources we wrote for this root-%d */
+ len = snprintf(str, sizeof(str), "root-%d-resources", i);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+ "%d", info.resource_count);
+ if (err)
+ goto out;
+ }
+
+ /* Finally, write some magic to synchronize with the guest. */
+ len = snprintf(str, sizeof(str), "root-resource-magic");
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+ "%lx", (sizeof(struct acpi_resource) * 2) + 1);
+
+out:
+ spin_unlock(&dev_data->lock);
+
+ return err;
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_list_entry *cntrl_entry, *c;
+ struct controller_dev_entry *dev_entry, *d;
+
+ list_for_each_entry_safe(cntrl_entry, c, &dev_data->list, list) {
+ list_for_each_entry_safe(dev_entry, d,
+ &cntrl_entry->dev_list, list) {
+ list_del(&dev_entry->list);
+ pcistub_put_pci_dev(dev_entry->dev);
+ kfree(dev_entry);
+ }
+ list_del(&cntrl_entry->list);
+ kfree(cntrl_entry);
+ }
+
+ kfree(dev_data);
+ pdev->pci_dev_data = NULL;
+}
--- /dev/null
+/*
+ * PCI Backend - Provides restricted access to the real PCI bus topology
+ * to the frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+struct passthrough_dev_data {
+ /* Access to dev_list must be protected by lock */
+ struct list_head dev_list;
+ spinlock_t lock;
+};
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry;
+ struct pci_dev *dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
+ if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
+ && bus == (unsigned int)dev_entry->dev->bus->number
+ && devfn == dev_entry->dev->devfn) {
+ dev = dev_entry->dev;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ return dev;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry;
+ unsigned long flags;
+ unsigned int domain, bus, devfn;
+ int err;
+
+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
+ if (!dev_entry)
+ return -ENOMEM;
+ dev_entry->dev = dev;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+ list_add_tail(&dev_entry->list, &dev_data->dev_list);
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ /* Publish this device. */
+ domain = (unsigned int)pci_domain_nr(dev->bus);
+ bus = (unsigned int)dev->bus->number;
+ devfn = dev->devfn;
+ err = publish_cb(pdev, domain, bus, devfn, devid);
+
+ return err;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry, *t;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
+ if (dev_entry->dev == dev) {
+ list_del(&dev_entry->list);
+ found_dev = dev_entry->dev;
+ kfree(dev_entry);
+ }
+ }
+
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ if (found_dev)
+ pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+ struct passthrough_dev_data *dev_data;
+
+ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
+ if (!dev_data)
+ return -ENOMEM;
+
+ spin_lock_init(&dev_data->lock);
+
+ INIT_LIST_HEAD(&dev_data->dev_list);
+
+ pdev->pci_dev_data = dev_data;
+
+ return 0;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb publish_root_cb)
+{
+ int err = 0;
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry, *e;
+ struct pci_dev *dev;
+ int found;
+ unsigned int domain, bus;
+
+ spin_lock(&dev_data->lock);
+
+ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
+ /* Only publish this device as a root if none of its
+ * parent bridges are exported
+ */
+ found = 0;
+ dev = dev_entry->dev->bus->self;
+ for (; !found && dev != NULL; dev = dev->bus->self) {
+ list_for_each_entry(e, &dev_data->dev_list, list) {
+ if (dev == e->dev) {
+ found = 1;
+ break;
+ }
+ }
+ }
+
+ domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
+ bus = (unsigned int)dev_entry->dev->bus->number;
+
+ if (!found) {
+ err = publish_root_cb(pdev, domain, bus);
+ if (err)
+ break;
+ }
+ }
+
+ spin_unlock(&dev_data->lock);
+
+ return err;
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry, *t;
+
+ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
+ list_del(&dev_entry->list);
+ pcistub_put_pci_dev(dev_entry->dev);
+ kfree(dev_entry);
+ }
+
+ kfree(dev_data);
+ pdev->pci_dev_data = NULL;
+}
--- /dev/null
+/*
+ * PCI Stub Driver - Grabs devices in backend to be exported later
+ *
+ * Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+#include <asm/atomic.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+static char *pci_devs_to_hide = NULL;
+module_param_named(hide, pci_devs_to_hide, charp, 0444);
+
+struct pcistub_device_id {
+ struct list_head slot_list;
+ int domain;
+ unsigned char bus;
+ unsigned int devfn;
+};
+static LIST_HEAD(pcistub_device_ids);
+static DEFINE_SPINLOCK(device_ids_lock);
+
+struct pcistub_device {
+ struct kref kref;
+ struct list_head dev_list;
+ spinlock_t lock;
+
+ struct pci_dev *dev;
+ struct pciback_device *pdev; /* non-NULL if struct pci_dev is in use */
+};
+
+/* Access to pcistub_devices & seized_devices lists and the initialize_devices
+ * flag must be locked with pcistub_devices_lock
+ */
+static DEFINE_SPINLOCK(pcistub_devices_lock);
+static LIST_HEAD(pcistub_devices);
+
+/* wait for device_initcall before initializing our devices
+ * (see pcistub_init_devices_late)
+ */
+static int initialize_devices = 0;
+static LIST_HEAD(seized_devices);
+
+static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+
+ dev_dbg(&dev->dev, "pcistub_device_alloc\n");
+
+ psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
+ if (!psdev)
+ return NULL;
+
+ psdev->dev = pci_dev_get(dev);
+ if (!psdev->dev) {
+ kfree(psdev);
+ return NULL;
+ }
+
+ kref_init(&psdev->kref);
+ spin_lock_init(&psdev->lock);
+
+ return psdev;
+}
+
+/* Don't call this directly as it's called by pcistub_device_put */
+static void pcistub_device_release(struct kref *kref)
+{
+ struct pcistub_device *psdev;
+
+ psdev = container_of(kref, struct pcistub_device, kref);
+
+ dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
+
+ /* Clean-up the device */
+ pciback_reset_device(psdev->dev);
+ pciback_config_free_dyn_fields(psdev->dev);
+ pciback_config_free_dev(psdev->dev);
+ kfree(pci_get_drvdata(psdev->dev));
+ pci_set_drvdata(psdev->dev, NULL);
+
+ pci_dev_put(psdev->dev);
+
+ kfree(psdev);
+}
+
+static inline void pcistub_device_get(struct pcistub_device *psdev)
+{
+ kref_get(&psdev->kref);
+}
+
+static inline void pcistub_device_put(struct pcistub_device *psdev)
+{
+ kref_put(&psdev->kref, pcistub_device_release);
+}
+
+static struct pcistub_device *pcistub_device_find(int domain, int bus,
+ int slot, int func)
+{
+ struct pcistub_device *psdev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev != NULL
+ && domain == pci_domain_nr(psdev->dev->bus)
+ && bus == psdev->dev->bus->number
+ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
+ pcistub_device_get(psdev);
+ goto out;
+ }
+ }
+
+ /* didn't find it */
+ psdev = NULL;
+
+ out:
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return psdev;
+}
+
+static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
+ struct pcistub_device *psdev)
+{
+ struct pci_dev *pci_dev = NULL;
+ unsigned long flags;
+
+ pcistub_device_get(psdev);
+
+ spin_lock_irqsave(&psdev->lock, flags);
+ if (!psdev->pdev) {
+ psdev->pdev = pdev;
+ pci_dev = psdev->dev;
+ }
+ spin_unlock_irqrestore(&psdev->lock, flags);
+
+ if (!pci_dev)
+ pcistub_device_put(psdev);
+
+ return pci_dev;
+}
+
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
+ int domain, int bus,
+ int slot, int func)
+{
+ struct pcistub_device *psdev;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev != NULL
+ && domain == pci_domain_nr(psdev->dev->bus)
+ && bus == psdev->dev->bus->number
+ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
+ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return found_dev;
+}
+
+struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
+ struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev == dev) {
+ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return found_dev;
+}
+
+void pcistub_put_pci_dev(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev, *found_psdev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev == dev) {
+ found_psdev = psdev;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ /* Cleanup our device
+ * (so it's ready for the next domain)
+ */
+ pciback_reset_device(found_psdev->dev);
+ pciback_config_free_dyn_fields(found_psdev->dev);
+ pciback_config_reset_dev(found_psdev->dev);
+
+ spin_lock_irqsave(&found_psdev->lock, flags);
+ found_psdev->pdev = NULL;
+ spin_unlock_irqrestore(&found_psdev->lock, flags);
+
+ pcistub_device_put(found_psdev);
+}
+
+static int __devinit pcistub_match_one(struct pci_dev *dev,
+ struct pcistub_device_id *pdev_id)
+{
+ /* Match the specified device by domain, bus, slot, func and also if
+ * any of the device's parent bridges match.
+ */
+ for (; dev != NULL; dev = dev->bus->self) {
+ if (pci_domain_nr(dev->bus) == pdev_id->domain
+ && dev->bus->number == pdev_id->bus
+ && dev->devfn == pdev_id->devfn)
+ return 1;
+
+ /* Sometimes topmost bridge links to itself. */
+ if (dev == dev->bus->self)
+ break;
+ }
+
+ return 0;
+}
+
+static int __devinit pcistub_match(struct pci_dev *dev)
+{
+ struct pcistub_device_id *pdev_id;
+ unsigned long flags;
+ int found = 0;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
+ if (pcistub_match_one(dev, pdev_id)) {
+ found = 1;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return found;
+}
+
+static int __devinit pcistub_init_device(struct pci_dev *dev)
+{
+ struct pciback_dev_data *dev_data;
+ int err = 0;
+
+ dev_dbg(&dev->dev, "initializing...\n");
+
+ /* The PCI backend is not intended to be a module (or to work with
+ * removable PCI devices (yet). If it were, pciback_config_free()
+ * would need to be called somewhere to free the memory allocated
+ * here and then to call kfree(pci_get_drvdata(psdev->dev)).
+ */
+ dev_data = kzalloc(sizeof(*dev_data), GFP_ATOMIC);
+ if (!dev_data) {
+ err = -ENOMEM;
+ goto out;
+ }
+ pci_set_drvdata(dev, dev_data);
+
+ dev_dbg(&dev->dev, "initializing config\n");
+ err = pciback_config_init_dev(dev);
+ if (err)
+ goto out;
+
+ /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
+ * must do this here because pcibios_enable_device may specify
+ * the pci device's true irq (and possibly its other resources)
+ * if they differ from what's in the configuration space.
+ * This makes the assumption that the device's resources won't
+ * change after this point (otherwise this code may break!)
+ */
+ dev_dbg(&dev->dev, "enabling device\n");
+ err = pci_enable_device(dev);
+ if (err)
+ goto config_release;
+
+ /* Now disable the device (this also ensures some private device
+ * data is setup before we export)
+ */
+ dev_dbg(&dev->dev, "reset device\n");
+ pciback_reset_device(dev);
+
+ return 0;
+
+ config_release:
+ pciback_config_free_dev(dev);
+
+ out:
+ pci_set_drvdata(dev, NULL);
+ kfree(dev_data);
+ return err;
+}
+
+/*
+ * Because some initialization still happens on
+ * devices during fs_initcall, we need to defer
+ * full initialization of our devices until
+ * device_initcall.
+ */
+static int __init pcistub_init_devices_late(void)
+{
+ struct pcistub_device *psdev;
+ unsigned long flags;
+ int err = 0;
+
+ pr_debug("pciback: pcistub_init_devices_late\n");
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ while (!list_empty(&seized_devices)) {
+ psdev = container_of(seized_devices.next,
+ struct pcistub_device, dev_list);
+ list_del(&psdev->dev_list);
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ err = pcistub_init_device(psdev->dev);
+ if (err) {
+ dev_err(&psdev->dev->dev,
+ "error %d initializing device\n", err);
+ kfree(psdev);
+ psdev = NULL;
+ }
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ if (psdev)
+ list_add_tail(&psdev->dev_list, &pcistub_devices);
+ }
+
+ initialize_devices = 1;
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ return 0;
+}
+
+static int __devinit pcistub_seize(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ unsigned long flags;
+ int err = 0;
+
+ psdev = pcistub_device_alloc(dev);
+ if (!psdev)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ if (initialize_devices) {
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ /* don't want irqs disabled when calling pcistub_init_device */
+ err = pcistub_init_device(psdev->dev);
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ if (!err)
+ list_add(&psdev->dev_list, &pcistub_devices);
+ } else {
+ dev_dbg(&dev->dev, "deferring initialization\n");
+ list_add(&psdev->dev_list, &seized_devices);
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ if (err)
+ pcistub_device_put(psdev);
+
+ return err;
+}
+
+static int __devinit pcistub_probe(struct pci_dev *dev,
+ const struct pci_device_id *id)
+{
+ int err = 0;
+
+ dev_dbg(&dev->dev, "probing...\n");
+
+ if (pcistub_match(dev)) {
+
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
+ && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
+ dev_err(&dev->dev, "can't export pci devices that "
+ "don't have a normal (0) or bridge (1) "
+ "header type!\n");
+ err = -ENODEV;
+ goto out;
+ }
+
+ dev_info(&dev->dev, "seizing device\n");
+ err = pcistub_seize(dev);
+ } else
+ /* Didn't find the device */
+ err = -ENODEV;
+
+ out:
+ return err;
+}
+
+static void pcistub_remove(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev, *found_psdev = NULL;
+ unsigned long flags;
+
+ dev_dbg(&dev->dev, "removing\n");
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ pciback_config_quirk_release(dev);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev == dev) {
+ found_psdev = psdev;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ if (found_psdev) {
+ dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
+ found_psdev->pdev);
+
+ if (found_psdev->pdev) {
+ printk(KERN_WARNING "pciback: ****** removing device "
+ "%s while still in-use! ******\n",
+ pci_name(found_psdev->dev));
+ printk(KERN_WARNING "pciback: ****** driver domain may "
+ "still access this device's i/o resources!\n");
+ printk(KERN_WARNING "pciback: ****** shutdown driver "
+ "domain before binding device\n");
+ printk(KERN_WARNING "pciback: ****** to other drivers "
+ "or domains\n");
+
+ pciback_release_pci_dev(found_psdev->pdev,
+ found_psdev->dev);
+ }
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+ list_del(&found_psdev->dev_list);
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ /* the final put for releasing from the list */
+ pcistub_device_put(found_psdev);
+ }
+}
+
+static struct pci_device_id pcistub_ids[] = {
+ {
+ .vendor = PCI_ANY_ID,
+ .device = PCI_ANY_ID,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ },
+ {0,},
+};
+
+/*
+ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
+ * for a normal device. I don't want it to be loaded automatically.
+ */
+
+static struct pci_driver pciback_pci_driver = {
+ .name = "pciback",
+ .id_table = pcistub_ids,
+ .probe = pcistub_probe,
+ .remove = pcistub_remove,
+};
+
+static inline int str_to_slot(const char *buf, int *domain, int *bus,
+ int *slot, int *func)
+{
+ int err;
+
+ err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
+ if (err == 4)
+ return 0;
+ else if (err < 0)
+ return -EINVAL;
+
+ /* try again without domain */
+ *domain = 0;
+ err = sscanf(buf, " %x:%x.%x", bus, slot, func);
+ if (err == 3)
+ return 0;
+
+ return -EINVAL;
+}
+
+static inline int str_to_quirk(const char *buf, int *domain, int *bus, int
+ *slot, int *func, int *reg, int *size, int *mask)
+{
+ int err;
+
+ err =
+ sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot,
+ func, reg, size, mask);
+ if (err == 7)
+ return 0;
+ return -EINVAL;
+}
+
+static int pcistub_device_id_add(int domain, int bus, int slot, int func)
+{
+ struct pcistub_device_id *pci_dev_id;
+ unsigned long flags;
+
+ pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
+ if (!pci_dev_id)
+ return -ENOMEM;
+
+ pci_dev_id->domain = domain;
+ pci_dev_id->bus = bus;
+ pci_dev_id->devfn = PCI_DEVFN(slot, func);
+
+ pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
+ domain, bus, slot, func);
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return 0;
+}
+
+static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
+{
+ struct pcistub_device_id *pci_dev_id, *t;
+ int devfn = PCI_DEVFN(slot, func);
+ int err = -ENOENT;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, slot_list) {
+
+ if (pci_dev_id->domain == domain
+ && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
+ /* Don't break; here because it's possible the same
+ * slot could be in the list more than once
+ */
+ list_del(&pci_dev_id->slot_list);
+ kfree(pci_dev_id);
+
+ err = 0;
+
+ pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
+ "seize list\n", domain, bus, slot, func);
+ }
+ }
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return err;
+}
+
+static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg,
+ int size, int mask)
+{
+ int err = 0;
+ struct pcistub_device *psdev;
+ struct pci_dev *dev;
+ struct config_field *field;
+
+ psdev = pcistub_device_find(domain, bus, slot, func);
+ if (!psdev || !psdev->dev) {
+ err = -ENODEV;
+ goto out;
+ }
+ dev = psdev->dev;
+
+ field = kzalloc(sizeof(*field), GFP_ATOMIC);
+ if (!field) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ field->offset = reg;
+ field->size = size;
+ field->mask = mask;
+ field->init = NULL;
+ field->reset = NULL;
+ field->release = NULL;
+ field->clean = pciback_config_field_free;
+
+ err = pciback_config_quirks_add_field(dev, field);
+ if (err)
+ kfree(field);
+ out:
+ return err;
+}
+
+static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func;
+ int err;
+
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+
+ err = pcistub_device_id_add(domain, bus, slot, func);
+
+ out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
+
+static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func;
+ int err;
+
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+
+ err = pcistub_device_id_remove(domain, bus, slot, func);
+
+ out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
+
+static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
+{
+ struct pcistub_device_id *pci_dev_id;
+ size_t count = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
+ if (count >= PAGE_SIZE)
+ break;
+
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "%04x:%02x:%02x.%01x\n",
+ pci_dev_id->domain, pci_dev_id->bus,
+ PCI_SLOT(pci_dev_id->devfn),
+ PCI_FUNC(pci_dev_id->devfn));
+ }
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return count;
+}
+
+DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
+
+static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func, reg, size, mask;
+ int err;
+
+ err = str_to_quirk(buf, &domain, &bus, &slot, &func, ®, &size,
+ &mask);
+ if (err)
+ goto out;
+
+ err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
+
+ out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf)
+{
+ int count = 0;
+ unsigned long flags;
+ extern struct list_head pciback_quirks;
+ struct pciback_config_quirk *quirk;
+ struct pciback_dev_data *dev_data;
+ struct config_field *field;
+ struct config_field_entry *cfg_entry;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry(quirk, &pciback_quirks, quirks_list) {
+ if (count >= PAGE_SIZE)
+ goto out;
+
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
+ quirk->pdev->bus->number,
+ PCI_SLOT(quirk->pdev->devfn),
+ PCI_FUNC(quirk->pdev->devfn),
+ quirk->devid.vendor, quirk->devid.device,
+ quirk->devid.subvendor,
+ quirk->devid.subdevice);
+
+ dev_data = pci_get_drvdata(quirk->pdev);
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+ if (count >= PAGE_SIZE)
+ goto out;
+
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "\t\t%08x:%01x:%08x\n",
+ cfg_entry->base_offset + field->offset,
+ field->size, field->mask);
+ }
+ }
+
+ out:
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return count;
+}
+
+DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add);
+
+static ssize_t permissive_add(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func;
+ int err;
+ struct pcistub_device *psdev;
+ struct pciback_dev_data *dev_data;
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+ psdev = pcistub_device_find(domain, bus, slot, func);
+ if (!psdev) {
+ err = -ENODEV;
+ goto out;
+ }
+ if (!psdev->dev) {
+ err = -ENODEV;
+ goto release;
+ }
+ dev_data = pci_get_drvdata(psdev->dev);
+ /* the driver data for a device should never be null at this point */
+ if (!dev_data) {
+ err = -ENXIO;
+ goto release;
+ }
+ if (!dev_data->permissive) {
+ dev_data->permissive = 1;
+ /* Let user know that what they're doing could be unsafe */
+ dev_warn(&psdev->dev->dev,
+ "enabling permissive mode configuration space accesses!\n");
+ dev_warn(&psdev->dev->dev,
+ "permissive mode is potentially unsafe!\n");
+ }
+ release:
+ pcistub_device_put(psdev);
+ out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+static ssize_t permissive_show(struct device_driver *drv, char *buf)
+{
+ struct pcistub_device *psdev;
+ struct pciback_dev_data *dev_data;
+ size_t count = 0;
+ unsigned long flags;
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (count >= PAGE_SIZE)
+ break;
+ if (!psdev->dev)
+ continue;
+ dev_data = pci_get_drvdata(psdev->dev);
+ if (!dev_data || !dev_data->permissive)
+ continue;
+ count +=
+ scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
+ pci_name(psdev->dev));
+ }
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return count;
+}
+
+DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add);
+
+static void pcistub_exit(void)
+{
+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
+ driver_remove_file(&pciback_pci_driver.driver,
+ &driver_attr_remove_slot);
+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks);
+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive);
+
+ pci_unregister_driver(&pciback_pci_driver);
+}
+
+static int __init pcistub_init(void)
+{
+ int pos = 0;
+ int err = 0;
+ int domain, bus, slot, func;
+ int parsed;
+
+ if (pci_devs_to_hide && *pci_devs_to_hide) {
+ do {
+ parsed = 0;
+
+ err = sscanf(pci_devs_to_hide + pos,
+ " (%x:%x:%x.%x) %n",
+ &domain, &bus, &slot, &func, &parsed);
+ if (err != 4) {
+ domain = 0;
+ err = sscanf(pci_devs_to_hide + pos,
+ " (%x:%x.%x) %n",
+ &bus, &slot, &func, &parsed);
+ if (err != 3)
+ goto parse_error;
+ }
+
+ err = pcistub_device_id_add(domain, bus, slot, func);
+ if (err)
+ goto out;
+
+ /* if parsed<=0, we've reached the end of the string */
+ pos += parsed;
+ } while (parsed > 0 && pci_devs_to_hide[pos]);
+ }
+
+ /* If we're the first PCI Device Driver to register, we're the
+ * first one to get offered PCI devices as they become
+ * available (and thus we can be the first to grab them)
+ */
+ err = pci_register_driver(&pciback_pci_driver);
+ if (err < 0)
+ goto out;
+
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_new_slot);
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_remove_slot);
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_slots);
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_quirks);
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_permissive);
+
+ if (err)
+ pcistub_exit();
+
+ out:
+ return err;
+
+ parse_error:
+ printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
+ pci_devs_to_hide + pos);
+ return -EINVAL;
+}
+
+#ifndef MODULE
+/*
+ * fs_initcall happens before device_initcall
+ * so pciback *should* get called first (b/c we
+ * want to suck up any device before other drivers
+ * get a chance by being the first pci device
+ * driver to register)
+ */
+fs_initcall(pcistub_init);
+#endif
+
+static int __init pciback_init(void)
+{
+ int err;
+
+ err = pciback_config_init();
+ if (err)
+ return err;
+
+#ifdef MODULE
+ err = pcistub_init();
+ if (err < 0)
+ return err;
+#endif
+
+ pcistub_init_devices_late();
+ err = pciback_xenbus_register();
+ if (err)
+ pcistub_exit();
+
+ return err;
+}
+
+static void __exit pciback_cleanup(void)
+{
+ pciback_xenbus_unregister();
+ pcistub_exit();
+}
+
+module_init(pciback_init);
+module_exit(pciback_cleanup);
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null
+/*
+ * PCI Backend Common Data Structures & Function Declarations
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#ifndef __XEN_PCIBACK_H__
+#define __XEN_PCIBACK_H__
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <xen/xenbus.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <asm/atomic.h>
+#include <xen/interface/io/pciif.h>
+
+struct pci_dev_entry {
+ struct list_head list;
+ struct pci_dev *dev;
+};
+
+#define _PDEVF_op_active (0)
+#define PDEVF_op_active (1<<(_PDEVF_op_active))
+
+struct pciback_device {
+ void *pci_dev_data;
+ spinlock_t dev_lock;
+
+ struct xenbus_device *xdev;
+
+ struct xenbus_watch be_watch;
+ u8 be_watching;
+
+ int evtchn_irq;
+
+ struct vm_struct *sh_area;
+ struct xen_pci_sharedinfo *sh_info;
+
+ unsigned long flags;
+
+ struct work_struct op_work;
+};
+
+struct pciback_dev_data {
+ struct list_head config_fields;
+ int permissive;
+ int warned_on_write;
+};
+
+/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
+ int domain, int bus,
+ int slot, int func);
+struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
+ struct pci_dev *dev);
+void pcistub_put_pci_dev(struct pci_dev *dev);
+
+/* Ensure a device is turned off or reset */
+void pciback_reset_device(struct pci_dev *pdev);
+
+/* Access a virtual configuration space for a PCI device */
+int pciback_config_init(void);
+int pciback_config_init_dev(struct pci_dev *dev);
+void pciback_config_free_dyn_fields(struct pci_dev *dev);
+void pciback_config_reset_dev(struct pci_dev *dev);
+void pciback_config_free_dev(struct pci_dev *dev);
+int pciback_config_read(struct pci_dev *dev, int offset, int size,
+ u32 * ret_val);
+int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
+
+/* Handle requests for specific devices from the frontend */
+typedef int (*publish_pci_dev_cb) (struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn, unsigned int devid);
+typedef int (*publish_pci_root_cb) (struct pciback_device * pdev,
+ unsigned int domain, unsigned int bus);
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb);
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn);
+int pciback_init_devices(struct pciback_device *pdev);
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb cb);
+void pciback_release_devices(struct pciback_device *pdev);
+
+/* Handles events from front-end */
+irqreturn_t pciback_handle_event(int irq, void *dev_id);
+void pciback_do_op(struct work_struct *work);
+
+int pciback_xenbus_register(void);
+void pciback_xenbus_unregister(void);
+
+extern int verbose_request;
+#endif
--- /dev/null
+/*
+ * PCI Backend Operations - respond to PCI requests from Frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <asm/bitops.h>
+#include <xen/evtchn.h>
+#include "pciback.h"
+
+int verbose_request = 0;
+module_param(verbose_request, int, 0644);
+
+/* Ensure a device is "turned off" and ready to be exported.
+ * (Also see pciback_config_reset to ensure virtual configuration space is
+ * ready to be re-exported)
+ */
+void pciback_reset_device(struct pci_dev *dev)
+{
+ u16 cmd;
+
+ /* Disable devices (but not bridges) */
+ if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
+ pci_disable_device(dev);
+
+ pci_write_config_word(dev, PCI_COMMAND, 0);
+
+ atomic_set(&dev->enable_cnt, 0);
+ dev->is_busmaster = 0;
+ } else {
+ pci_read_config_word(dev, PCI_COMMAND, &cmd);
+ if (cmd & (PCI_COMMAND_INVALIDATE)) {
+ cmd &= ~(PCI_COMMAND_INVALIDATE);
+ pci_write_config_word(dev, PCI_COMMAND, cmd);
+
+ dev->is_busmaster = 0;
+ }
+ }
+}
+
+static inline void test_and_schedule_op(struct pciback_device *pdev)
+{
+ /* Check that frontend is requesting an operation and that we are not
+ * already processing a request */
+ if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
+ && !test_and_set_bit(_PDEVF_op_active, &pdev->flags))
+ schedule_work(&pdev->op_work);
+}
+
+/* Performing the configuration space reads/writes must not be done in atomic
+ * context because some of the pci_* functions can sleep (mostly due to ACPI
+ * use of semaphores). This function is intended to be called from a work
+ * queue in process context taking a struct pciback_device as a parameter */
+void pciback_do_op(struct work_struct *work)
+{
+ struct pciback_device *pdev = container_of(work, struct pciback_device, op_work);
+ struct pci_dev *dev;
+ struct xen_pci_op *op = &pdev->sh_info->op;
+
+ dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
+
+ if (dev == NULL)
+ op->err = XEN_PCI_ERR_dev_not_found;
+ else if (op->cmd == XEN_PCI_OP_conf_read)
+ op->err = pciback_config_read(dev, op->offset, op->size,
+ &op->value);
+ else if (op->cmd == XEN_PCI_OP_conf_write)
+ op->err = pciback_config_write(dev, op->offset, op->size,
+ op->value);
+ else
+ op->err = XEN_PCI_ERR_not_implemented;
+
+ /* Tell the driver domain that we're done. */
+ wmb();
+ clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
+ notify_remote_via_irq(pdev->evtchn_irq);
+
+ /* Mark that we're done. */
+ smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */
+ clear_bit(_PDEVF_op_active, &pdev->flags);
+ smp_mb__after_clear_bit(); /* /before/ final check for work */
+
+ /* Check to see if the driver domain tried to start another request in
+ * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. */
+ test_and_schedule_op(pdev);
+}
+
+irqreturn_t pciback_handle_event(int irq, void *dev_id)
+{
+ struct pciback_device *pdev = dev_id;
+
+ test_and_schedule_op(pdev);
+
+ return IRQ_HANDLED;
+}
--- /dev/null
+/*
+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
+ * to the frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> (vpci.c)
+ * Author: Tristan Gingold <tristan.gingold@bull.net>, from vpci.c
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+/* There are at most 32 slots in a pci bus. */
+#define PCI_SLOT_MAX 32
+
+#define PCI_BUS_NBR 2
+
+struct slot_dev_data {
+ /* Access to dev_list must be protected by lock */
+ struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX];
+ spinlock_t lock;
+};
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn)
+{
+ struct pci_dev *dev = NULL;
+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ unsigned long flags;
+
+ if (domain != 0 || PCI_FUNC(devfn) != 0)
+ return NULL;
+
+ if (PCI_SLOT(devfn) >= PCI_SLOT_MAX || bus >= PCI_BUS_NBR)
+ return NULL;
+
+ spin_lock_irqsave(&slot_dev->lock, flags);
+ dev = slot_dev->slots[bus][PCI_SLOT(devfn)];
+ spin_unlock_irqrestore(&slot_dev->lock, flags);
+
+ return dev;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb)
+{
+ int err = 0, slot, bus;
+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ unsigned long flags;
+
+ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
+ err = -EFAULT;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Can't export bridges on the virtual PCI bus");
+ goto out;
+ }
+
+ spin_lock_irqsave(&slot_dev->lock, flags);
+
+ /* Assign to a new slot on the virtual PCI bus */
+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ if (slot_dev->slots[bus][slot] == NULL) {
+ printk(KERN_INFO
+ "pciback: slot: %s: assign to virtual slot %d, bus %d\n",
+ pci_name(dev), slot, bus);
+ slot_dev->slots[bus][slot] = dev;
+ goto unlock;
+ }
+ }
+
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "No more space on root virtual PCI bus");
+
+ unlock:
+ spin_unlock_irqrestore(&slot_dev->lock, flags);
+
+ /* Publish this device. */
+ if(!err)
+ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, 0), devid);
+
+ out:
+ return err;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+ int slot, bus;
+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&slot_dev->lock, flags);
+
+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ if (slot_dev->slots[bus][slot] == dev) {
+ slot_dev->slots[bus][slot] = NULL;
+ found_dev = dev;
+ goto out;
+ }
+ }
+
+ out:
+ spin_unlock_irqrestore(&slot_dev->lock, flags);
+
+ if (found_dev)
+ pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+ int slot, bus;
+ struct slot_dev_data *slot_dev;
+
+ slot_dev = kmalloc(sizeof(*slot_dev), GFP_KERNEL);
+ if (!slot_dev)
+ return -ENOMEM;
+
+ spin_lock_init(&slot_dev->lock);
+
+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
+ slot_dev->slots[bus][slot] = NULL;
+
+ pdev->pci_dev_data = slot_dev;
+
+ return 0;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb publish_cb)
+{
+ /* The Virtual PCI bus has only one root */
+ return publish_cb(pdev, 0, 0);
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+ int slot, bus;
+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ struct pci_dev *dev;
+
+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ dev = slot_dev->slots[bus][slot];
+ if (dev != NULL)
+ pcistub_put_pci_dev(dev);
+ }
+
+ kfree(slot_dev);
+ pdev->pci_dev_data = NULL;
+}
--- /dev/null
+/*
+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
+ * to the frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+#define PCI_SLOT_MAX 32
+
+struct vpci_dev_data {
+ /* Access to dev_list must be protected by lock */
+ struct list_head dev_list[PCI_SLOT_MAX];
+ spinlock_t lock;
+};
+
+static inline struct list_head *list_first(struct list_head *head)
+{
+ return head->next;
+}
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn)
+{
+ struct pci_dev_entry *entry;
+ struct pci_dev *dev = NULL;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+ unsigned long flags;
+
+ if (domain != 0 || bus != 0)
+ return NULL;
+
+ if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
+ spin_lock_irqsave(&vpci_dev->lock, flags);
+
+ list_for_each_entry(entry,
+ &vpci_dev->dev_list[PCI_SLOT(devfn)],
+ list) {
+ if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
+ dev = entry->dev;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+ }
+ return dev;
+}
+
+static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
+{
+ if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
+ && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
+ return 1;
+
+ return 0;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb)
+{
+ int err = 0, slot, func;
+ struct pci_dev_entry *t, *dev_entry;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+ unsigned long flags;
+
+ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
+ err = -EFAULT;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Can't export bridges on the virtual PCI bus");
+ goto out;
+ }
+
+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
+ if (!dev_entry) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error adding entry to virtual PCI bus");
+ goto out;
+ }
+
+ dev_entry->dev = dev;
+
+ spin_lock_irqsave(&vpci_dev->lock, flags);
+
+ /* Keep multi-function devices together on the virtual PCI bus */
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ if (!list_empty(&vpci_dev->dev_list[slot])) {
+ t = list_entry(list_first(&vpci_dev->dev_list[slot]),
+ struct pci_dev_entry, list);
+
+ if (match_slot(dev, t->dev)) {
+ pr_info("pciback: vpci: %s: "
+ "assign to virtual slot %d func %d\n",
+ pci_name(dev), slot,
+ PCI_FUNC(dev->devfn));
+ list_add_tail(&dev_entry->list,
+ &vpci_dev->dev_list[slot]);
+ func = PCI_FUNC(dev->devfn);
+ goto unlock;
+ }
+ }
+ }
+
+ /* Assign to a new slot on the virtual PCI bus */
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ if (list_empty(&vpci_dev->dev_list[slot])) {
+ printk(KERN_INFO
+ "pciback: vpci: %s: assign to virtual slot %d\n",
+ pci_name(dev), slot);
+ list_add_tail(&dev_entry->list,
+ &vpci_dev->dev_list[slot]);
+ func = PCI_FUNC(dev->devfn);
+ goto unlock;
+ }
+ }
+
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "No more space on root virtual PCI bus");
+
+ unlock:
+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+
+ /* Publish this device. */
+ if(!err)
+ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid);
+
+ out:
+ return err;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+ int slot;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vpci_dev->lock, flags);
+
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ struct pci_dev_entry *e, *tmp;
+ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
+ list) {
+ if (e->dev == dev) {
+ list_del(&e->list);
+ found_dev = e->dev;
+ kfree(e);
+ goto out;
+ }
+ }
+ }
+
+ out:
+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+
+ if (found_dev)
+ pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+ int slot;
+ struct vpci_dev_data *vpci_dev;
+
+ vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
+ if (!vpci_dev)
+ return -ENOMEM;
+
+ spin_lock_init(&vpci_dev->lock);
+
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
+ }
+
+ pdev->pci_dev_data = vpci_dev;
+
+ return 0;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb publish_cb)
+{
+ /* The Virtual PCI bus has only one root */
+ return publish_cb(pdev, 0, 0);
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+ int slot;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ struct pci_dev_entry *e, *tmp;
+ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
+ list) {
+ list_del(&e->list);
+ pcistub_put_pci_dev(e->dev);
+ kfree(e);
+ }
+ }
+
+ kfree(vpci_dev);
+ pdev->pci_dev_data = NULL;
+}
--- /dev/null
+/*
+ * PCI Backend Xenbus Setup - handles setup with frontend and xend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <xen/xenbus.h>
+#include <xen/evtchn.h>
+#include "pciback.h"
+
+#define INVALID_EVTCHN_IRQ (-1)
+
+static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
+{
+ struct pciback_device *pdev;
+
+ pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
+ if (pdev == NULL)
+ goto out;
+ dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
+
+ pdev->xdev = xdev;
+ xdev->dev.driver_data = pdev;
+
+ spin_lock_init(&pdev->dev_lock);
+
+ pdev->sh_area = NULL;
+ pdev->sh_info = NULL;
+ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+ pdev->be_watching = 0;
+
+ INIT_WORK(&pdev->op_work, pciback_do_op);
+
+ if (pciback_init_devices(pdev)) {
+ kfree(pdev);
+ pdev = NULL;
+ }
+ out:
+ return pdev;
+}
+
+static void free_pdev(struct pciback_device *pdev)
+{
+ if (pdev->be_watching)
+ unregister_xenbus_watch(&pdev->be_watch);
+
+ /* Ensure the guest can't trigger our handler before removing devices */
+ if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ)
+ unbind_from_irqhandler(pdev->evtchn_irq, pdev);
+
+ /* If the driver domain started an op, make sure we complete it or
+ * delete it before releasing the shared memory */
+ flush_scheduled_work();
+
+ if (pdev->sh_info)
+ xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_area);
+
+ pciback_release_devices(pdev);
+
+ pdev->xdev->dev.driver_data = NULL;
+ pdev->xdev = NULL;
+
+ kfree(pdev);
+}
+
+static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
+ int remote_evtchn)
+{
+ int err = 0;
+ struct vm_struct *area;
+
+ dev_dbg(&pdev->xdev->dev,
+ "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
+ gnt_ref, remote_evtchn);
+
+ area = xenbus_map_ring_valloc(pdev->xdev, gnt_ref);
+ if (IS_ERR(area)) {
+ err = PTR_ERR(area);
+ goto out;
+ }
+ pdev->sh_area = area;
+ pdev->sh_info = area->addr;
+
+ err = bind_interdomain_evtchn_to_irqhandler(
+ pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event,
+ IRQF_SAMPLE_RANDOM, "pciback", pdev);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error binding event channel to IRQ");
+ goto out;
+ }
+ pdev->evtchn_irq = err;
+ err = 0;
+
+ dev_dbg(&pdev->xdev->dev, "Attached!\n");
+ out:
+ return err;
+}
+
+static int pciback_attach(struct pciback_device *pdev)
+{
+ int err = 0;
+ int gnt_ref, remote_evtchn;
+ char *magic = NULL;
+
+ spin_lock(&pdev->dev_lock);
+
+ /* Make sure we only do this setup once */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateInitialised)
+ goto out;
+
+ /* Wait for frontend to state that it has published the configuration */
+ if (xenbus_read_driver_state(pdev->xdev->otherend) !=
+ XenbusStateInitialised)
+ goto out;
+
+ dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
+
+ err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
+ "pci-op-ref", "%u", &gnt_ref,
+ "event-channel", "%u", &remote_evtchn,
+ "magic", NULL, &magic, NULL);
+ if (err) {
+ /* If configuration didn't get read correctly, wait longer */
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading configuration from frontend");
+ goto out;
+ }
+
+ if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
+ xenbus_dev_fatal(pdev->xdev, -EFAULT,
+ "version mismatch (%s/%s) with pcifront - "
+ "halting pciback",
+ magic, XEN_PCI_MAGIC);
+ goto out;
+ }
+
+ err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
+ if (err)
+ goto out;
+
+ dev_dbg(&pdev->xdev->dev, "Connecting...\n");
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+ if (err)
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching to connected state!");
+
+ dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
+ out:
+ spin_unlock(&pdev->dev_lock);
+
+ if (magic)
+ kfree(magic);
+
+ return err;
+}
+
+static int pciback_publish_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn, unsigned int devid)
+{
+ int err;
+ int len;
+ char str[64];
+
+ len = snprintf(str, sizeof(str), "vdev-%d", devid);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+ "%04x:%02x:%02x.%02x", domain, bus,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+ out:
+ return err;
+}
+
+static int pciback_export_device(struct pciback_device *pdev,
+ int domain, int bus, int slot, int func,
+ int devid)
+{
+ struct pci_dev *dev;
+ int err = 0;
+
+ dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
+ domain, bus, slot, func);
+
+ dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
+ if (!dev) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Couldn't locate PCI device "
+ "(%04x:%02x:%02x.%01x)! "
+ "perhaps already in-use?",
+ domain, bus, slot, func);
+ goto out;
+ }
+
+ err = pciback_add_pci_dev(pdev, dev, devid, pciback_publish_pci_dev);
+ if (err)
+ goto out;
+
+ /* TODO: It'd be nice to export a bridge and have all of its children
+ * get exported with it. This may be best done in xend (which will
+ * have to calculate resource usage anyway) but we probably want to
+ * put something in here to ensure that if a bridge gets given to a
+ * driver domain, that all devices under that bridge are not given
+ * to other driver domains (as he who controls the bridge can disable
+ * it and stop the other devices from working).
+ */
+ out:
+ return err;
+}
+
+static int pciback_remove_device(struct pciback_device *pdev,
+ int domain, int bus, int slot, int func)
+{
+ int err = 0;
+ struct pci_dev *dev;
+
+ dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n",
+ domain, bus, slot, func);
+
+ dev = pciback_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func));
+ if (!dev) {
+ err = -EINVAL;
+ dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device "
+ "(%04x:%02x:%02x.%01x)! not owned by this domain\n",
+ domain, bus, slot, func);
+ goto out;
+ }
+
+ pciback_release_pci_dev(pdev, dev);
+
+ out:
+ return err;
+}
+
+static int pciback_publish_pci_root(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus)
+{
+ unsigned int d, b;
+ int i, root_num, len, err;
+ char str[64];
+
+ dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ "root_num", "%d", &root_num);
+ if (err == 0 || err == -ENOENT)
+ root_num = 0;
+ else if (err < 0)
+ goto out;
+
+ /* Verify that we haven't already published this pci root */
+ for (i = 0; i < root_num; i++) {
+ len = snprintf(str, sizeof(str), "root-%d", i);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ str, "%x:%x", &d, &b);
+ if (err < 0)
+ goto out;
+ if (err != 2) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (d == domain && b == bus) {
+ err = 0;
+ goto out;
+ }
+ }
+
+ len = snprintf(str, sizeof(str), "root-%d", root_num);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
+ root_num, domain, bus);
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+ "%04x:%02x", domain, bus);
+ if (err)
+ goto out;
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
+ "root_num", "%d", (root_num + 1));
+
+ out:
+ return err;
+}
+
+static int pciback_reconfigure(struct pciback_device *pdev)
+{
+ int err = 0;
+ int num_devs;
+ int domain, bus, slot, func;
+ int substate;
+ int i, len;
+ char state_str[64];
+ char dev_str[64];
+
+ spin_lock(&pdev->dev_lock);
+
+ dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
+
+ /* Make sure we only reconfigure once */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateReconfiguring)
+ goto out;
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
+ &num_devs);
+ if (err != 1) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading number of devices");
+ goto out;
+ }
+
+ for (i = 0; i < num_devs; i++) {
+ len = snprintf(state_str, sizeof(state_str), "state-%d", i);
+ if (unlikely(len >= (sizeof(state_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while reading "
+ "configuration");
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str,
+ "%d", &substate);
+ if (err != 1)
+ substate = XenbusStateUnknown;
+
+ switch (substate) {
+ case XenbusStateInitialising:
+ dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i);
+
+ len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
+ if (unlikely(len >= (sizeof(dev_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while "
+ "reading configuration");
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ dev_str, "%x:%x:%x.%x",
+ &domain, &bus, &slot, &func);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading device "
+ "configuration");
+ goto out;
+ }
+ if (err != 4) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error parsing pci device "
+ "configuration");
+ goto out;
+ }
+
+ err = pciback_export_device(pdev, domain, bus, slot,
+ func, i);
+ if (err)
+ goto out;
+
+ /* Publish pci roots. */
+ err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error while publish PCI root"
+ "buses for frontend");
+ goto out;
+ }
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
+ state_str, "%d",
+ XenbusStateInitialised);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching substate of "
+ "dev-%d\n", i);
+ goto out;
+ }
+ break;
+
+ case XenbusStateClosing:
+ dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i);
+
+ len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i);
+ if (unlikely(len >= (sizeof(dev_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while "
+ "reading configuration");
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ dev_str, "%x:%x:%x.%x",
+ &domain, &bus, &slot, &func);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading device "
+ "configuration");
+ goto out;
+ }
+ if (err != 4) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error parsing pci device "
+ "configuration");
+ goto out;
+ }
+
+ err = pciback_remove_device(pdev, domain, bus, slot,
+ func);
+ if(err)
+ goto out;
+
+ /* TODO: If at some point we implement support for pci
+ * root hot-remove on pcifront side, we'll need to
+ * remove unnecessary xenstore nodes of pci roots here.
+ */
+
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching to reconfigured state!");
+ goto out;
+ }
+
+ out:
+ spin_unlock(&pdev->dev_lock);
+
+ return 0;
+}
+
+static void pciback_frontend_changed(struct xenbus_device *xdev,
+ enum xenbus_state fe_state)
+{
+ struct pciback_device *pdev = xdev->dev.driver_data;
+
+ dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
+
+ switch (fe_state) {
+ case XenbusStateInitialised:
+ pciback_attach(pdev);
+ break;
+
+ case XenbusStateClosing:
+ xenbus_switch_state(xdev, XenbusStateClosed);
+ break;
+
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
+ device_unregister(&xdev->dev);
+ break;
+
+ case XenbusStateReconfiguring:
+ pciback_reconfigure(pdev);
+ break;
+
+ case XenbusStateConnected:
+ /* pcifront switched its state from reconfiguring to connected.
+ * Then switch to connected state.
+ */
+ xenbus_switch_state(xdev, XenbusStateConnected);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static int pciback_setup_backend(struct pciback_device *pdev)
+{
+ /* Get configuration from xend (if available now) */
+ int domain, bus, slot, func;
+ int err = 0;
+ int i, num_devs;
+ char dev_str[64];
+ char state_str[64];
+
+ spin_lock(&pdev->dev_lock);
+
+ /* It's possible we could get the call to setup twice, so make sure
+ * we're not already connected.
+ */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateInitWait)
+ goto out;
+
+ dev_dbg(&pdev->xdev->dev, "getting be setup\n");
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
+ &num_devs);
+ if (err != 1) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading number of devices");
+ goto out;
+ }
+
+ for (i = 0; i < num_devs; i++) {
+ int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
+ if (unlikely(l >= (sizeof(dev_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while reading "
+ "configuration");
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
+ "%x:%x:%x.%x", &domain, &bus, &slot, &func);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading device configuration");
+ goto out;
+ }
+ if (err != 4) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error parsing pci device "
+ "configuration");
+ goto out;
+ }
+
+ err = pciback_export_device(pdev, domain, bus, slot, func, i);
+ if (err)
+ goto out;
+
+ /* Switch substate of this device. */
+ l = snprintf(state_str, sizeof(state_str), "state-%d", i);
+ if (unlikely(l >= (sizeof(state_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while reading "
+ "configuration");
+ goto out;
+ }
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str,
+ "%d", XenbusStateInitialised);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err, "Error switching "
+ "substate of dev-%d\n", i);
+ goto out;
+ }
+ }
+
+ err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error while publish PCI root buses "
+ "for frontend");
+ goto out;
+ }
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
+ if (err)
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching to initialised state!");
+
+ out:
+ spin_unlock(&pdev->dev_lock);
+
+ if (!err)
+ /* see if pcifront is already configured (if not, we'll wait) */
+ pciback_attach(pdev);
+
+ return err;
+}
+
+static void pciback_be_watch(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ struct pciback_device *pdev =
+ container_of(watch, struct pciback_device, be_watch);
+
+ switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
+ case XenbusStateInitWait:
+ pciback_setup_backend(pdev);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static int pciback_xenbus_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err = 0;
+ struct pciback_device *pdev = alloc_pdev(dev);
+
+ if (pdev == NULL) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(dev, err,
+ "Error allocating pciback_device struct");
+ goto out;
+ }
+
+ /* wait for xend to configure us */
+ err = xenbus_switch_state(dev, XenbusStateInitWait);
+ if (err)
+ goto out;
+
+ /* watch the backend node for backend configuration information */
+ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
+ pciback_be_watch);
+ if (err)
+ goto out;
+ pdev->be_watching = 1;
+
+ /* We need to force a call to our callback here in case
+ * xend already configured us!
+ */
+ pciback_be_watch(&pdev->be_watch, NULL, 0);
+
+ out:
+ return err;
+}
+
+static int pciback_xenbus_remove(struct xenbus_device *dev)
+{
+ struct pciback_device *pdev = dev->dev.driver_data;
+
+ if (pdev != NULL)
+ free_pdev(pdev);
+
+ return 0;
+}
+
+static const struct xenbus_device_id xenpci_ids[] = {
+ {"pci"},
+ {{0}},
+};
+
+static struct xenbus_driver xenbus_pciback_driver = {
+ .name = "pciback",
+ .ids = xenpci_ids,
+ .probe = pciback_xenbus_probe,
+ .remove = pciback_xenbus_remove,
+ .otherend_changed = pciback_frontend_changed,
+};
+
+int __init pciback_xenbus_register(void)
+{
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ return xenbus_register_backend(&xenbus_pciback_driver);
+}
+
+void __exit pciback_xenbus_unregister(void)
+{
+ xenbus_unregister_driver(&xenbus_pciback_driver);
+}
--- /dev/null
+obj-y += pcifront.o
+
+pcifront-y := pci_op.o xenbus.o pci.o
+
+ccflags-$(CONFIG_XEN_PCIDEV_FE_DEBUG) += -DDEBUG
--- /dev/null
+/*
+ * PCI Frontend Operations - ensure only one PCI frontend runs at a time
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pcifront.h"
+
+DEFINE_SPINLOCK(pcifront_dev_lock);
+static struct pcifront_device *pcifront_dev = NULL;
+
+int pcifront_connect(struct pcifront_device *pdev)
+{
+ int err = 0;
+
+ spin_lock(&pcifront_dev_lock);
+
+ if (!pcifront_dev) {
+ dev_info(&pdev->xdev->dev, "Installing PCI frontend\n");
+ pcifront_dev = pdev;
+ }
+ else {
+ dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n");
+ err = -EEXIST;
+ }
+
+ spin_unlock(&pcifront_dev_lock);
+
+ return err;
+}
+
+void pcifront_disconnect(struct pcifront_device *pdev)
+{
+ spin_lock(&pcifront_dev_lock);
+
+ if (pdev == pcifront_dev) {
+ dev_info(&pdev->xdev->dev,
+ "Disconnecting PCI Frontend Buses\n");
+ pcifront_dev = NULL;
+ }
+
+ spin_unlock(&pcifront_dev_lock);
+}
--- /dev/null
+/*
+ * PCI Frontend Operations - Communicates with frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/time.h>
+#include <xen/evtchn.h>
+#include "pcifront.h"
+
+static int verbose_request = 0;
+module_param(verbose_request, int, 0644);
+
+#ifdef __ia64__
+static void pcifront_init_sd(struct pcifront_sd *sd,
+ unsigned int domain, unsigned int bus,
+ struct pcifront_device *pdev)
+{
+ int err, i, j, k, len, root_num, res_count;
+ struct acpi_resource res;
+ unsigned int d, b, byte;
+ unsigned long magic;
+ char str[64], tmp[3];
+ unsigned char *buf, *bufp;
+ u8 *ptr;
+
+ memset(sd, 0, sizeof(*sd));
+
+ sd->segment = domain;
+ sd->node = -1; /* Revisit for NUMA */
+ sd->platform_data = pdev;
+
+ /* Look for resources for this controller in xenbus. */
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "root_num",
+ "%d", &root_num);
+ if (err != 1)
+ return;
+
+ for (i = 0; i < root_num; i++) {
+ len = snprintf(str, sizeof(str), "root-%d", i);
+ if (unlikely(len >= (sizeof(str) - 1)))
+ return;
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
+ str, "%x:%x", &d, &b);
+ if (err != 2)
+ return;
+
+ if (d == domain && b == bus)
+ break;
+ }
+
+ if (i == root_num)
+ return;
+
+ len = snprintf(str, sizeof(str), "root-resource-magic");
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
+ str, "%lx", &magic);
+
+ if (err != 1)
+ return; /* No resources, nothing to do */
+
+ if (magic != (sizeof(res) * 2) + 1) {
+ printk(KERN_WARNING "pcifront: resource magic mismatch\n");
+ return;
+ }
+
+ len = snprintf(str, sizeof(str), "root-%d-resources", i);
+ if (unlikely(len >= (sizeof(str) - 1)))
+ return;
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
+ str, "%d", &res_count);
+
+ if (err != 1)
+ return; /* No resources, nothing to do */
+
+ sd->window = kzalloc(sizeof(*sd->window) * res_count, GFP_KERNEL);
+ if (!sd->window)
+ return;
+
+ /* magic is also the size of the byte stream in xenbus */
+ buf = kmalloc(magic, GFP_KERNEL);
+ if (!buf) {
+ kfree(sd->window);
+ sd->window = NULL;
+ return;
+ }
+
+ /* Read the resources out of xenbus */
+ for (j = 0; j < res_count; j++) {
+ memset(&res, 0, sizeof(res));
+ memset(buf, 0, magic);
+
+ len = snprintf(str, sizeof(str), "root-%d-resource-%d", i, j);
+ if (unlikely(len >= (sizeof(str) - 1)))
+ return;
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
+ "%s", buf);
+ if (err != 1) {
+ printk(KERN_WARNING "pcifront: error reading "
+ "resource %d on bus %04x:%02x\n",
+ j, domain, bus);
+ continue;
+ }
+
+ bufp = buf;
+ ptr = (u8 *)&res;
+ memset(tmp, 0, sizeof(tmp));
+
+ /* Copy ASCII byte stream into structure */
+ for (k = 0; k < magic - 1; k += 2) {
+ memcpy(tmp, bufp, 2);
+ bufp += 2;
+
+ sscanf(tmp, "%02x", &byte);
+ *ptr = byte;
+ ptr++;
+ }
+
+ xen_add_resource(sd, domain, bus, &res);
+ sd->windows++;
+ }
+ kfree(buf);
+}
+#endif
+
+static int errno_to_pcibios_err(int errno)
+{
+ switch (errno) {
+ case XEN_PCI_ERR_success:
+ return PCIBIOS_SUCCESSFUL;
+
+ case XEN_PCI_ERR_dev_not_found:
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ case XEN_PCI_ERR_invalid_offset:
+ case XEN_PCI_ERR_op_failed:
+ return PCIBIOS_BAD_REGISTER_NUMBER;
+
+ case XEN_PCI_ERR_not_implemented:
+ return PCIBIOS_FUNC_NOT_SUPPORTED;
+
+ case XEN_PCI_ERR_access_denied:
+ return PCIBIOS_SET_FAILED;
+ }
+ return errno;
+}
+
+static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op)
+{
+ int err = 0;
+ struct xen_pci_op *active_op = &pdev->sh_info->op;
+ unsigned long irq_flags;
+ evtchn_port_t port = pdev->evtchn;
+ s64 ns, ns_timeout;
+ struct timeval tv;
+
+ spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
+
+ memcpy(active_op, op, sizeof(struct xen_pci_op));
+
+ /* Go */
+ wmb();
+ set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
+ notify_remote_via_evtchn(port);
+
+ /*
+ * We set a poll timeout of 3 seconds but give up on return after
+ * 2 seconds. It is better to time out too late rather than too early
+ * (in the latter case we end up continually re-executing poll() with a
+ * timeout in the past). 1s difference gives plenty of slack for error.
+ */
+ do_gettimeofday(&tv);
+ ns_timeout = timeval_to_ns(&tv) + 2 * (s64)NSEC_PER_SEC;
+
+ clear_evtchn(port);
+
+ while (test_bit(_XEN_PCIF_active,
+ (unsigned long *)&pdev->sh_info->flags)) {
+ if (HYPERVISOR_poll(&port, 1, jiffies + 3*HZ))
+ BUG();
+ clear_evtchn(port);
+ do_gettimeofday(&tv);
+ ns = timeval_to_ns(&tv);
+ if (ns > ns_timeout) {
+ dev_err(&pdev->xdev->dev,
+ "pciback not responding!!!\n");
+ clear_bit(_XEN_PCIF_active,
+ (unsigned long *)&pdev->sh_info->flags);
+ err = XEN_PCI_ERR_dev_not_found;
+ goto out;
+ }
+ }
+
+ memcpy(op, active_op, sizeof(struct xen_pci_op));
+
+ err = op->err;
+ out:
+ spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags);
+ return err;
+}
+
+/* Access to this function is spinlocked in drivers/pci/access.c */
+static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn,
+ int where, int size, u32 * val)
+{
+ int err = 0;
+ struct xen_pci_op op = {
+ .cmd = XEN_PCI_OP_conf_read,
+ .domain = pci_domain_nr(bus),
+ .bus = bus->number,
+ .devfn = devfn,
+ .offset = where,
+ .size = size,
+ };
+ struct pcifront_sd *sd = bus->sysdata;
+ struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+ if (verbose_request)
+ dev_info(&pdev->xdev->dev,
+ "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n",
+ pci_domain_nr(bus), bus->number, PCI_SLOT(devfn),
+ PCI_FUNC(devfn), where, size);
+
+ err = do_pci_op(pdev, &op);
+
+ if (likely(!err)) {
+ if (verbose_request)
+ dev_info(&pdev->xdev->dev, "read got back value %x\n",
+ op.value);
+
+ *val = op.value;
+ } else if (err == -ENODEV) {
+ /* No device here, pretend that it just returned 0 */
+ err = 0;
+ *val = 0;
+ }
+
+ return errno_to_pcibios_err(err);
+}
+
+/* Access to this function is spinlocked in drivers/pci/access.c */
+static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn,
+ int where, int size, u32 val)
+{
+ struct xen_pci_op op = {
+ .cmd = XEN_PCI_OP_conf_write,
+ .domain = pci_domain_nr(bus),
+ .bus = bus->number,
+ .devfn = devfn,
+ .offset = where,
+ .size = size,
+ .value = val,
+ };
+ struct pcifront_sd *sd = bus->sysdata;
+ struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+ if (verbose_request)
+ dev_info(&pdev->xdev->dev,
+ "write dev=%04x:%02x:%02x.%01x - "
+ "offset %x size %d val %x\n",
+ pci_domain_nr(bus), bus->number,
+ PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val);
+
+ return errno_to_pcibios_err(do_pci_op(pdev, &op));
+}
+
+struct pci_ops pcifront_bus_ops = {
+ .read = pcifront_bus_read,
+ .write = pcifront_bus_write,
+};
+
+/* Claim resources for the PCI frontend as-is, backend won't allow changes */
+static void pcifront_claim_resource(struct pci_dev *dev, void *data)
+{
+ struct pcifront_device *pdev = data;
+ int i;
+ struct resource *r;
+
+ for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+ r = &dev->resource[i];
+
+ if (!r->parent && r->start && r->flags) {
+ dev_dbg(&pdev->xdev->dev, "claiming resource %s/%d\n",
+ pci_name(dev), i);
+ pci_claim_resource(dev, i);
+ }
+ }
+}
+
+int __devinit pcifront_scan_root(struct pcifront_device *pdev,
+ unsigned int domain, unsigned int bus)
+{
+ struct pci_bus *b;
+ struct pcifront_sd *sd = NULL;
+ struct pci_bus_entry *bus_entry = NULL;
+ int err = 0;
+
+#ifndef CONFIG_PCI_DOMAINS
+ if (domain != 0) {
+ dev_err(&pdev->xdev->dev,
+ "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
+ dev_err(&pdev->xdev->dev,
+ "Please compile with CONFIG_PCI_DOMAINS\n");
+ err = -EINVAL;
+ goto err_out;
+ }
+#endif
+
+ dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n",
+ domain, bus);
+
+ bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL);
+ sd = kmalloc(sizeof(*sd), GFP_KERNEL);
+ if (!bus_entry || !sd) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ pcifront_init_sd(sd, domain, bus, pdev);
+
+ b = pci_scan_bus_parented(&pdev->xdev->dev, bus,
+ &pcifront_bus_ops, sd);
+ if (!b) {
+ dev_err(&pdev->xdev->dev,
+ "Error creating PCI Frontend Bus!\n");
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ pcifront_setup_root_resources(b, sd);
+ bus_entry->bus = b;
+
+ list_add(&bus_entry->list, &pdev->root_buses);
+
+ /* Claim resources before going "live" with our devices */
+ pci_walk_bus(b, pcifront_claim_resource, pdev);
+
+ pci_bus_add_devices(b);
+
+ return 0;
+
+ err_out:
+ kfree(bus_entry);
+ kfree(sd);
+
+ return err;
+}
+
+int __devinit pcifront_rescan_root(struct pcifront_device *pdev,
+ unsigned int domain, unsigned int bus)
+{
+ struct pci_bus *b;
+ struct pci_dev *d;
+ unsigned int devfn;
+
+#ifndef CONFIG_PCI_DOMAINS
+ if (domain != 0) {
+ dev_err(&pdev->xdev->dev,
+ "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
+ dev_err(&pdev->xdev->dev,
+ "Please compile with CONFIG_PCI_DOMAINS\n");
+ return -EINVAL;
+ }
+#endif
+
+ dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n",
+ domain, bus);
+
+ b = pci_find_bus(domain, bus);
+ if(!b)
+ /* If the bus is unknown, create it. */
+ return pcifront_scan_root(pdev, domain, bus);
+
+ /* Rescan the bus for newly attached functions and add.
+ * We omit handling of PCI bridge attachment because pciback prevents
+ * bridges from being exported.
+ */
+ for (devfn = 0; devfn < 0x100; devfn++) {
+ d = pci_get_slot(b, devfn);
+ if(d) {
+ /* Device is already known. */
+ pci_dev_put(d);
+ continue;
+ }
+
+ d = pci_scan_single_device(b, devfn);
+ if (d) {
+ int err;
+
+ dev_info(&pdev->xdev->dev, "New device on "
+ "%04x:%02x:%02x.%02x found.\n", domain, bus,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+ err = pci_bus_add_device(d);
+ if (err)
+ dev_err(&pdev->xdev->dev,
+ "error %d adding device, continuing.\n",
+ err);
+ }
+ }
+
+ return 0;
+}
+
+static void free_root_bus_devs(struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ while (!list_empty(&bus->devices)) {
+ dev = container_of(bus->devices.next, struct pci_dev,
+ bus_list);
+ dev_dbg(&dev->dev, "removing device\n");
+ pci_remove_bus_device(dev);
+ }
+}
+
+void pcifront_free_roots(struct pcifront_device *pdev)
+{
+ struct pci_bus_entry *bus_entry, *t;
+
+ dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n");
+
+ list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) {
+ list_del(&bus_entry->list);
+
+ free_root_bus_devs(bus_entry->bus);
+
+ kfree(bus_entry->bus->sysdata);
+
+ device_unregister(bus_entry->bus->bridge);
+ pci_remove_bus(bus_entry->bus);
+
+ kfree(bus_entry);
+ }
+}
--- /dev/null
+/*
+ * PCI Frontend - Common data structures & function declarations
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#ifndef __XEN_PCIFRONT_H__
+#define __XEN_PCIFRONT_H__
+
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/pciif.h>
+#include <xen/pcifront.h>
+
+struct pci_bus_entry {
+ struct list_head list;
+ struct pci_bus *bus;
+};
+
+struct pcifront_device {
+ struct xenbus_device *xdev;
+ struct list_head root_buses;
+ spinlock_t dev_lock;
+
+ int evtchn;
+ int gnt_ref;
+
+ /* Lock this when doing any operations in sh_info */
+ spinlock_t sh_info_lock;
+ struct xen_pci_sharedinfo *sh_info;
+};
+
+int pcifront_connect(struct pcifront_device *pdev);
+void pcifront_disconnect(struct pcifront_device *pdev);
+
+int pcifront_scan_root(struct pcifront_device *pdev,
+ unsigned int domain, unsigned int bus);
+int pcifront_rescan_root(struct pcifront_device *pdev,
+ unsigned int domain, unsigned int bus);
+void pcifront_free_roots(struct pcifront_device *pdev);
+
+#endif /* __XEN_PCIFRONT_H__ */
--- /dev/null
+/*
+ * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn)
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <xen/xenbus.h>
+#include <xen/gnttab.h>
+#include "pcifront.h"
+
+#define INVALID_GRANT_REF (0)
+#define INVALID_EVTCHN (-1)
+
+static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev)
+{
+ struct pcifront_device *pdev;
+
+ pdev = kzalloc(sizeof(struct pcifront_device), GFP_KERNEL);
+ if (pdev == NULL)
+ goto out;
+
+ pdev->sh_info =
+ (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL);
+ if (pdev->sh_info == NULL) {
+ kfree(pdev);
+ pdev = NULL;
+ goto out;
+ }
+ pdev->sh_info->flags = 0;
+
+ xdev->dev.driver_data = pdev;
+ pdev->xdev = xdev;
+
+ INIT_LIST_HEAD(&pdev->root_buses);
+
+ spin_lock_init(&pdev->dev_lock);
+ spin_lock_init(&pdev->sh_info_lock);
+
+ pdev->evtchn = INVALID_EVTCHN;
+ pdev->gnt_ref = INVALID_GRANT_REF;
+
+ dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n",
+ pdev, pdev->sh_info);
+ out:
+ return pdev;
+}
+
+static void free_pdev(struct pcifront_device *pdev)
+{
+ dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev);
+
+ pcifront_free_roots(pdev);
+
+ if (pdev->evtchn != INVALID_EVTCHN)
+ xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
+
+ if (pdev->gnt_ref != INVALID_GRANT_REF)
+ gnttab_end_foreign_access(pdev->gnt_ref,
+ (unsigned long)pdev->sh_info);
+
+ pdev->xdev->dev.driver_data = NULL;
+
+ kfree(pdev);
+}
+
+static int pcifront_publish_info(struct pcifront_device *pdev)
+{
+ int err = 0;
+ struct xenbus_transaction trans;
+
+ err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
+ if (err < 0)
+ goto out;
+
+ pdev->gnt_ref = err;
+
+ err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
+ if (err)
+ goto out;
+
+ do_publish:
+ err = xenbus_transaction_start(&trans);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error writing configuration for backend "
+ "(start transaction)");
+ goto out;
+ }
+
+ err = xenbus_printf(trans, pdev->xdev->nodename,
+ "pci-op-ref", "%u", pdev->gnt_ref);
+ if (!err)
+ err = xenbus_printf(trans, pdev->xdev->nodename,
+ "event-channel", "%u", pdev->evtchn);
+ if (!err)
+ err = xenbus_printf(trans, pdev->xdev->nodename,
+ "magic", XEN_PCI_MAGIC);
+
+ if (err) {
+ xenbus_transaction_end(trans, 1);
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error writing configuration for backend");
+ goto out;
+ } else {
+ err = xenbus_transaction_end(trans, 0);
+ if (err == -EAGAIN)
+ goto do_publish;
+ else if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error completing transaction "
+ "for backend");
+ goto out;
+ }
+ }
+
+ xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
+
+ dev_dbg(&pdev->xdev->dev, "publishing successful!\n");
+
+ out:
+ return err;
+}
+
+static int __devinit pcifront_try_connect(struct pcifront_device *pdev)
+{
+ int err = -EFAULT;
+ int i, num_roots, len;
+ char str[64];
+ unsigned int domain, bus;
+
+ spin_lock(&pdev->dev_lock);
+
+ /* Only connect once */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateInitialised)
+ goto out;
+
+ err = pcifront_connect(pdev);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error connecting PCI Frontend");
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
+ "root_num", "%d", &num_roots);
+ if (err == -ENOENT) {
+ xenbus_dev_error(pdev->xdev, err,
+ "No PCI Roots found, trying 0000:00");
+ err = pcifront_scan_root(pdev, 0, 0);
+ num_roots = 0;
+ } else if (err != 1) {
+ if (err == 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading number of PCI roots");
+ goto out;
+ }
+
+ for (i = 0; i < num_roots; i++) {
+ len = snprintf(str, sizeof(str), "root-%d", i);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
+ "%x:%x", &domain, &bus);
+ if (err != 2) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading PCI root %d", i);
+ goto out;
+ }
+
+ err = pcifront_scan_root(pdev, domain, bus);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error scanning PCI root %04x:%02x",
+ domain, bus);
+ goto out;
+ }
+ }
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+ if (err)
+ goto out;
+
+ out:
+ spin_unlock(&pdev->dev_lock);
+ return err;
+}
+
+static int pcifront_try_disconnect(struct pcifront_device *pdev)
+{
+ int err = 0;
+ enum xenbus_state prev_state;
+
+ spin_lock(&pdev->dev_lock);
+
+ prev_state = xenbus_read_driver_state(pdev->xdev->nodename);
+
+ if (prev_state < XenbusStateClosing)
+ err = xenbus_switch_state(pdev->xdev, XenbusStateClosing);
+
+ if (!err && prev_state == XenbusStateConnected)
+ pcifront_disconnect(pdev);
+
+ spin_unlock(&pdev->dev_lock);
+
+ return err;
+}
+
+static int __devinit pcifront_attach_devices(struct pcifront_device *pdev)
+{
+ int err = -EFAULT;
+ int i, num_roots, len;
+ unsigned int domain, bus;
+ char str[64];
+
+ spin_lock(&pdev->dev_lock);
+
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateReconfiguring)
+ goto out;
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
+ "root_num", "%d", &num_roots);
+ if (err == -ENOENT) {
+ xenbus_dev_error(pdev->xdev, err,
+ "No PCI Roots found, trying 0000:00");
+ err = pcifront_rescan_root(pdev, 0, 0);
+ num_roots = 0;
+ } else if (err != 1) {
+ if (err == 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading number of PCI roots");
+ goto out;
+ }
+
+ for (i = 0; i < num_roots; i++) {
+ len = snprintf(str, sizeof(str), "root-%d", i);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
+ "%x:%x", &domain, &bus);
+ if (err != 2) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading PCI root %d", i);
+ goto out;
+ }
+
+ err = pcifront_rescan_root(pdev, domain, bus);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error scanning PCI root %04x:%02x",
+ domain, bus);
+ goto out;
+ }
+ }
+
+ xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+
+ out:
+ spin_unlock(&pdev->dev_lock);
+ return err;
+}
+
+static int pcifront_detach_devices(struct pcifront_device *pdev)
+{
+ int err = 0;
+ int i, num_devs;
+ unsigned int domain, bus, slot, func;
+ struct pci_bus *pci_bus;
+ struct pci_dev *pci_dev;
+ char str[64];
+
+ spin_lock(&pdev->dev_lock);
+
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateConnected)
+ goto out;
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "num_devs", "%d",
+ &num_devs);
+ if (err != 1) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading number of PCI devices");
+ goto out;
+ }
+
+ /* Find devices being detached and remove them. */
+ for (i = 0; i < num_devs; i++) {
+ int l, state;
+ l = snprintf(str, sizeof(str), "state-%d", i);
+ if (unlikely(l >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, "%d",
+ &state);
+ if (err != 1)
+ state = XenbusStateUnknown;
+
+ if (state != XenbusStateClosing)
+ continue;
+
+ /* Remove device. */
+ l = snprintf(str, sizeof(str), "vdev-%d", i);
+ if (unlikely(l >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
+ "%x:%x:%x.%x", &domain, &bus, &slot, &func);
+ if (err != 4) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading PCI device %d", i);
+ goto out;
+ }
+
+ pci_bus = pci_find_bus(domain, bus);
+ if(!pci_bus) {
+ dev_dbg(&pdev->xdev->dev, "Cannot get bus %04x:%02x\n",
+ domain, bus);
+ continue;
+ }
+ pci_dev = pci_get_slot(pci_bus, PCI_DEVFN(slot, func));
+ if(!pci_dev) {
+ dev_dbg(&pdev->xdev->dev,
+ "Cannot get PCI device %04x:%02x:%02x.%02x\n",
+ domain, bus, slot, func);
+ continue;
+ }
+ pci_remove_bus_device(pci_dev);
+ pci_dev_put(pci_dev);
+
+ dev_dbg(&pdev->xdev->dev,
+ "PCI device %04x:%02x:%02x.%02x removed.\n",
+ domain, bus, slot, func);
+ }
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateReconfiguring);
+
+ out:
+ spin_unlock(&pdev->dev_lock);
+ return err;
+}
+
+static void __init_refok pcifront_backend_changed(struct xenbus_device *xdev,
+ enum xenbus_state be_state)
+{
+ struct pcifront_device *pdev = xdev->dev.driver_data;
+
+ switch (be_state) {
+ case XenbusStateClosing:
+ dev_warn(&xdev->dev, "backend going away!\n");
+ pcifront_try_disconnect(pdev);
+ break;
+
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ dev_warn(&xdev->dev, "backend went away!\n");
+ pcifront_try_disconnect(pdev);
+
+ device_unregister(&pdev->xdev->dev);
+ break;
+
+ case XenbusStateConnected:
+ pcifront_try_connect(pdev);
+ break;
+
+ case XenbusStateReconfiguring:
+ pcifront_detach_devices(pdev);
+ break;
+
+ case XenbusStateReconfigured:
+ pcifront_attach_devices(pdev);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static int pcifront_xenbus_probe(struct xenbus_device *xdev,
+ const struct xenbus_device_id *id)
+{
+ int err = 0;
+ struct pcifront_device *pdev = alloc_pdev(xdev);
+
+ if (pdev == NULL) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(xdev, err,
+ "Error allocating pcifront_device struct");
+ goto out;
+ }
+
+ err = pcifront_publish_info(pdev);
+
+ out:
+ return err;
+}
+
+static int pcifront_xenbus_remove(struct xenbus_device *xdev)
+{
+ if (xdev->dev.driver_data)
+ free_pdev(xdev->dev.driver_data);
+
+ return 0;
+}
+
+static const struct xenbus_device_id xenpci_ids[] = {
+ {"pci"},
+ {{0}},
+};
+MODULE_ALIAS("xen:pci");
+
+static struct xenbus_driver xenbus_pcifront_driver = {
+ .name = "pcifront",
+ .ids = xenpci_ids,
+ .probe = pcifront_xenbus_probe,
+ .remove = pcifront_xenbus_remove,
+ .otherend_changed = pcifront_backend_changed,
+};
+
+static int __init pcifront_init(void)
+{
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ return xenbus_register_frontend(&xenbus_pcifront_driver);
+}
+
+/* Initialize after the Xen PCI Frontend Stub is initialized */
+subsys_initcall(pcifront_init);
--- /dev/null
+
+obj-y += privcmd.o
+obj-$(CONFIG_COMPAT) += compat_privcmd.o
--- /dev/null
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (C) IBM Corp. 2006
+ *
+ * Authors: Jimi Xenidis <jimix@watson.ibm.com>
+ */
+
+#include <linux/compat.h>
+#include <linux/ioctl.h>
+#include <linux/syscalls.h>
+#include <asm/hypervisor.h>
+#include <asm/uaccess.h>
+#include <xen/public/privcmd.h>
+#include <xen/compat_ioctl.h>
+
+int privcmd_ioctl_32(int fd, unsigned int cmd, unsigned long arg)
+{
+ int ret;
+
+ switch (cmd) {
+ case IOCTL_PRIVCMD_MMAP_32: {
+ struct privcmd_mmap *p;
+ struct privcmd_mmap_32 *p32;
+ struct privcmd_mmap_32 n32;
+
+ p32 = compat_ptr(arg);
+ p = compat_alloc_user_space(sizeof(*p));
+ if (copy_from_user(&n32, p32, sizeof(n32)) ||
+ put_user(n32.num, &p->num) ||
+ put_user(n32.dom, &p->dom) ||
+ put_user(compat_ptr(n32.entry), &p->entry))
+ return -EFAULT;
+
+ ret = sys_ioctl(fd, IOCTL_PRIVCMD_MMAP, (unsigned long)p);
+ }
+ break;
+ case IOCTL_PRIVCMD_MMAPBATCH_32: {
+ struct privcmd_mmapbatch *p;
+ struct privcmd_mmapbatch_32 *p32;
+ struct privcmd_mmapbatch_32 n32;
+
+ p32 = compat_ptr(arg);
+ p = compat_alloc_user_space(sizeof(*p));
+ if (copy_from_user(&n32, p32, sizeof(n32)) ||
+ put_user(n32.num, &p->num) ||
+ put_user(n32.dom, &p->dom) ||
+ put_user(n32.addr, &p->addr) ||
+ put_user(compat_ptr(n32.arr), &p->arr))
+ return -EFAULT;
+
+ ret = sys_ioctl(fd, IOCTL_PRIVCMD_MMAPBATCH, (unsigned long)p);
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
--- /dev/null
+/******************************************************************************
+ * privcmd.c
+ *
+ * Interface to privileged domain-0 commands.
+ *
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/smp_lock.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <asm/hypervisor.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+#include <asm/hypervisor.h>
+#include <xen/public/privcmd.h>
+#include <xen/interface/xen.h>
+#include <xen/xen_proc.h>
+#include <xen/features.h>
+
+static struct proc_dir_entry *privcmd_intf;
+static struct proc_dir_entry *capabilities_intf;
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
+#endif
+
+static long privcmd_ioctl(struct file *file,
+ unsigned int cmd, unsigned long data)
+{
+ int ret = -ENOSYS;
+ void __user *udata = (void __user *) data;
+
+ switch (cmd) {
+ case IOCTL_PRIVCMD_HYPERCALL: {
+ privcmd_hypercall_t hypercall;
+
+ if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
+ return -EFAULT;
+
+#if defined(__i386__)
+ if (hypercall.op >= (PAGE_SIZE >> 5))
+ break;
+ __asm__ __volatile__ (
+ "pushl %%ebx; pushl %%ecx; pushl %%edx; "
+ "pushl %%esi; pushl %%edi; "
+ "movl 8(%%eax),%%ebx ;"
+ "movl 16(%%eax),%%ecx ;"
+ "movl 24(%%eax),%%edx ;"
+ "movl 32(%%eax),%%esi ;"
+ "movl 40(%%eax),%%edi ;"
+ "movl (%%eax),%%eax ;"
+ "shll $5,%%eax ;"
+ "addl $hypercall_page,%%eax ;"
+ "call *%%eax ;"
+ "popl %%edi; popl %%esi; popl %%edx; "
+ "popl %%ecx; popl %%ebx"
+ : "=a" (ret) : "0" (&hypercall) : "memory" );
+#elif defined (__x86_64__)
+ if (hypercall.op < (PAGE_SIZE >> 5)) {
+ long ign1, ign2, ign3;
+ __asm__ __volatile__ (
+ "movq %8,%%r10; movq %9,%%r8;"
+ "shll $5,%%eax ;"
+ "addq $hypercall_page,%%rax ;"
+ "call *%%rax"
+ : "=a" (ret), "=D" (ign1),
+ "=S" (ign2), "=d" (ign3)
+ : "0" ((unsigned int)hypercall.op),
+ "1" (hypercall.arg[0]),
+ "2" (hypercall.arg[1]),
+ "3" (hypercall.arg[2]),
+ "g" (hypercall.arg[3]),
+ "g" (hypercall.arg[4])
+ : "r8", "r10", "memory" );
+ }
+#else
+ ret = privcmd_hypercall(&hypercall);
+#endif
+ }
+ break;
+
+ case IOCTL_PRIVCMD_MMAP: {
+ privcmd_mmap_t mmapcmd;
+ privcmd_mmap_entry_t msg;
+ privcmd_mmap_entry_t __user *p;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long va;
+ int i, rc;
+
+ if (!is_initial_xendomain())
+ return -EPERM;
+
+ if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
+ return -EFAULT;
+
+ p = mmapcmd.entry;
+ if (copy_from_user(&msg, p, sizeof(msg)))
+ return -EFAULT;
+
+ down_write(&mm->mmap_sem);
+
+ vma = find_vma(mm, msg.va);
+ rc = -EINVAL;
+ if (!vma || (msg.va != vma->vm_start) ||
+ !privcmd_enforce_singleshot_mapping(vma))
+ goto mmap_out;
+
+ va = vma->vm_start;
+
+ for (i = 0; i < mmapcmd.num; i++) {
+ rc = -EFAULT;
+ if (copy_from_user(&msg, p, sizeof(msg)))
+ goto mmap_out;
+
+ /* Do not allow range to wrap the address space. */
+ rc = -EINVAL;
+ if ((msg.npages > (LONG_MAX >> PAGE_SHIFT)) ||
+ ((unsigned long)(msg.npages << PAGE_SHIFT) >= -va))
+ goto mmap_out;
+
+ /* Range chunks must be contiguous in va space. */
+ if ((msg.va != va) ||
+ ((msg.va+(msg.npages<<PAGE_SHIFT)) > vma->vm_end))
+ goto mmap_out;
+
+ if ((rc = direct_remap_pfn_range(
+ vma,
+ msg.va & PAGE_MASK,
+ msg.mfn,
+ msg.npages << PAGE_SHIFT,
+ vma->vm_page_prot,
+ mmapcmd.dom)) < 0)
+ goto mmap_out;
+
+ p++;
+ va += msg.npages << PAGE_SHIFT;
+ }
+
+ rc = 0;
+
+ mmap_out:
+ up_write(&mm->mmap_sem);
+ ret = rc;
+ }
+ break;
+
+ case IOCTL_PRIVCMD_MMAPBATCH: {
+ privcmd_mmapbatch_t m;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ xen_pfn_t __user *p;
+ unsigned long addr, mfn, nr_pages;
+ int i;
+
+ if (!is_initial_xendomain())
+ return -EPERM;
+
+ if (copy_from_user(&m, udata, sizeof(m)))
+ return -EFAULT;
+
+ nr_pages = m.num;
+ if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
+ return -EINVAL;
+
+ down_write(&mm->mmap_sem);
+
+ vma = find_vma(mm, m.addr);
+ if (!vma ||
+ (m.addr != vma->vm_start) ||
+ ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
+ !privcmd_enforce_singleshot_mapping(vma)) {
+ up_write(&mm->mmap_sem);
+ return -EINVAL;
+ }
+
+ p = m.arr;
+ addr = m.addr;
+ for (i = 0; i < nr_pages; i++, addr += PAGE_SIZE, p++) {
+ if (get_user(mfn, p)) {
+ up_write(&mm->mmap_sem);
+ return -EFAULT;
+ }
+
+ ret = direct_remap_pfn_range(vma, addr & PAGE_MASK,
+ mfn, PAGE_SIZE,
+ vma->vm_page_prot, m.dom);
+ if (ret < 0)
+ put_user(0xF0000000 | mfn, p);
+ }
+
+ up_write(&mm->mmap_sem);
+ ret = 0;
+ }
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+static struct vm_operations_struct privcmd_vm_ops = {
+ .fault = privcmd_fault
+};
+
+static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
+{
+#ifndef __powerpc__ /* PowerPC has a trick to safely do this. */
+ /* Unsupported for auto-translate guests. */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return -ENOSYS;
+#endif
+
+ /* DONTCOPY is essential for Xen as copy_page_range is broken. */
+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTCOPY;
+ vma->vm_ops = &privcmd_vm_ops;
+ vma->vm_private_data = NULL;
+
+ return 0;
+}
+
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
+{
+ return (xchg(&vma->vm_private_data, (void *)1) == NULL);
+}
+#endif
+
+static const struct file_operations privcmd_file_ops = {
+ .unlocked_ioctl = privcmd_ioctl,
+ .mmap = privcmd_mmap,
+};
+
+static int capabilities_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ int len = 0;
+ *page = 0;
+
+ if (is_initial_xendomain())
+ len = sprintf( page, "control_d\n" );
+
+ *eof = 1;
+ return len;
+}
+
+static int __init privcmd_init(void)
+{
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ privcmd_intf = create_xen_proc_entry("privcmd", 0400);
+ if (privcmd_intf != NULL)
+ privcmd_intf->proc_fops = &privcmd_file_ops;
+
+ capabilities_intf = create_xen_proc_entry("capabilities", 0400 );
+ if (capabilities_intf != NULL)
+ capabilities_intf->read_proc = capabilities_read;
+
+ return 0;
+}
+
+__initcall(privcmd_init);
--- /dev/null
+EXTRA_CFLAGS += -Idrivers/xen/sfc_netback -Idrivers/xen/sfc_netutil -Idrivers/xen/netback -Idrivers/net/sfc
+EXTRA_CFLAGS += -D__ci_driver__
+EXTRA_CFLAGS += -DEFX_USE_KCOMPAT
+EXTRA_CFLAGS += -Werror
+
+ifdef GCOV
+EXTRA_CFLAGS += -fprofile-arcs -ftest-coverage -DEFX_GCOV
+endif
+
+obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_BACKEND) := sfc_netback.o
+
+sfc_netback-objs := accel.o accel_fwd.o accel_msg.o accel_solarflare.o accel_xenbus.o accel_debugfs.o
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include "accel.h"
+#include "accel_msg_iface.h"
+#include "accel_solarflare.h"
+
+#include <linux/notifier.h>
+
+#ifdef EFX_GCOV
+#include "gcov.h"
+#endif
+
+static int netback_accel_netdev_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *net_dev = (struct net_device *)ptr;
+ struct netback_accel *bend;
+
+ if ((event == NETDEV_UP) || (event == NETDEV_DOWN)) {
+ mutex_lock(&bend_list_mutex);
+ bend = bend_list;
+ while (bend != NULL) {
+ mutex_lock(&bend->bend_mutex);
+ /*
+ * This happens when the shared pages have
+ * been unmapped, but the bend not yet removed
+ * from list
+ */
+ if (bend->shared_page == NULL)
+ goto next;
+
+ if (bend->net_dev->ifindex == net_dev->ifindex)
+ netback_accel_set_interface_state
+ (bend, event == NETDEV_UP);
+
+ next:
+ mutex_unlock(&bend->bend_mutex);
+ bend = bend->next_bend;
+ }
+ mutex_unlock(&bend_list_mutex);
+ }
+
+ return NOTIFY_DONE;
+}
+
+
+static struct notifier_block netback_accel_netdev_notifier = {
+ .notifier_call = netback_accel_netdev_event,
+};
+
+
+unsigned sfc_netback_max_pages = NETBACK_ACCEL_DEFAULT_MAX_BUF_PAGES;
+module_param_named(max_pages, sfc_netback_max_pages, uint, 0644);
+MODULE_PARM_DESC(max_pages,
+ "The number of buffer pages to enforce on each guest");
+
+/* Initialise subsystems need for the accelerated fast path */
+static int __init netback_accel_init(void)
+{
+ int rc = 0;
+
+#ifdef EFX_GCOV
+ gcov_provider_init(THIS_MODULE);
+#endif
+
+ rc = netback_accel_init_fwd();
+
+ if (rc == 0)
+ netback_accel_debugfs_init();
+
+ if (rc == 0)
+ rc = netback_accel_sf_init();
+
+ if (rc == 0)
+ rc = register_netdevice_notifier
+ (&netback_accel_netdev_notifier);
+
+ /*
+ * What if no device was found, shouldn't we clean up stuff
+ * we've allocated for acceleration subsystem?
+ */
+
+ return rc;
+}
+
+module_init(netback_accel_init);
+
+static void __exit netback_accel_exit(void)
+{
+ unregister_netdevice_notifier(&netback_accel_netdev_notifier);
+
+ netback_accel_sf_shutdown();
+
+ netback_accel_shutdown_bends();
+
+ netback_accel_debugfs_fini();
+
+ netback_accel_shutdown_fwd();
+
+#ifdef EFX_GCOV
+ gcov_provider_fini(THIS_MODULE);
+#endif
+}
+
+module_exit(netback_accel_exit);
+
+MODULE_LICENSE("GPL");
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef NETBACK_ACCEL_H
+#define NETBACK_ACCEL_H
+
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+
+#include <xen/xenbus.h>
+
+#include "accel_shared_fifo.h"
+#include "accel_msg_iface.h"
+#include "accel_util.h"
+
+/**************************************************************************
+ * Datatypes
+ **************************************************************************/
+
+#define NETBACK_ACCEL_DEFAULT_MAX_FILTERS (8)
+#define NETBACK_ACCEL_DEFAULT_MAX_MCASTS (8)
+#define NETBACK_ACCEL_DEFAULT_MAX_BUF_PAGES (384)
+/* Variable to store module parameter for max_buf_pages */
+extern unsigned sfc_netback_max_pages;
+
+#define NETBACK_ACCEL_STATS 1
+
+#if NETBACK_ACCEL_STATS
+#define NETBACK_ACCEL_STATS_OP(x) x
+#else
+#define NETBACK_ACCEL_STATS_OP(x)
+#endif
+
+/*! Statistics for a given backend */
+struct netback_accel_stats {
+ /*! Number of eventq wakeup events */
+ u64 evq_wakeups;
+ /*! Number of eventq timeout events */
+ u64 evq_timeouts;
+ /*! Number of filters used */
+ u32 num_filters;
+ /*! Number of buffer pages registered */
+ u32 num_buffer_pages;
+};
+
+
+/* Debug fs nodes for each of the above stats */
+struct netback_accel_dbfs {
+ struct dentry *evq_wakeups;
+ struct dentry *evq_timeouts;
+ struct dentry *num_filters;
+ struct dentry *num_buffer_pages;
+};
+
+
+/*! Resource limits for a given NIC */
+struct netback_accel_limits {
+ int max_filters; /*!< Max. number of filters to use. */
+ int max_mcasts; /*!< Max. number of mcast subscriptions */
+ int max_buf_pages; /*!< Max. number of pages of NIC buffers */
+};
+
+
+/*! The state for an instance of the back end driver. */
+struct netback_accel {
+ /*! mutex to protect this state */
+ struct mutex bend_mutex;
+
+ /*! Watches on xenstore */
+ struct xenbus_watch domu_accel_watch;
+ struct xenbus_watch config_accel_watch;
+
+ /*! Pointer to whatever device cookie ties us in to the hypervisor */
+ void *hdev_data;
+
+ /*! FIFO indices. Next page is msg FIFOs */
+ struct net_accel_shared_page *shared_page;
+
+ /*! Defer control message processing */
+ struct work_struct handle_msg;
+
+ /*! Identifies other end VM and interface.*/
+ int far_end;
+ int vif_num;
+
+ /*!< To unmap the shared pages */
+ void *sh_pages_unmap;
+
+ /* Resource tracking */
+ /*! Limits on H/W & Dom0 resources */
+ struct netback_accel_limits quotas;
+
+ /* Hardware resources */
+ /*! The H/W type of associated NIC */
+ enum net_accel_hw_type hw_type;
+ /*! State of allocation */
+ int hw_state;
+ /*! Index into ci_driver.nics[] for this interface */
+ int nic_index;
+ /*! How to set up the acceleration for this hardware */
+ int (*accel_setup)(struct netback_accel *);
+ /*! And how to stop it. */
+ void (*accel_shutdown)(struct netback_accel *);
+
+ /*! The physical/real net_dev for this interface */
+ struct net_device *net_dev;
+
+ /*! Magic pointer to locate state in fowarding table */
+ void *fwd_priv;
+
+ /*! Message FIFO */
+ sh_msg_fifo2 to_domU;
+ /*! Message FIFO */
+ sh_msg_fifo2 from_domU;
+
+ /*! General notification channel id */
+ int msg_channel;
+ /*! General notification channel irq */
+ int msg_channel_irq;
+
+ /*! Event channel id dedicated to network packet interrupts. */
+ int net_channel;
+ /*! Event channel irq dedicated to network packets interrupts */
+ int net_channel_irq;
+
+ /*! The MAC address the frontend goes by. */
+ u8 mac[ETH_ALEN];
+ /*! Driver name of associated NIC */
+ char *nicname;
+
+ /*! Array of pointers to buffer pages mapped */
+ grant_handle_t *buffer_maps;
+ u64 *buffer_addrs;
+ /*! Index into buffer_maps */
+ int buffer_maps_index;
+ /*! Max number of pages that domU is allowed/will request to map */
+ int max_pages;
+
+ /*! Pointer to hardware specific private area */
+ void *accel_hw_priv;
+
+ /*! Wait queue for changes in accelstate. */
+ wait_queue_head_t state_wait_queue;
+
+ /*! Current state of the frontend according to the xenbus
+ * watch. */
+ XenbusState frontend_state;
+
+ /*! Current state of this backend. */
+ XenbusState backend_state;
+
+ /*! Non-zero if the backend is being removed. */
+ int removing;
+
+ /*! Non-zero if the setup_vnic has been called. */
+ int vnic_is_setup;
+
+#if NETBACK_ACCEL_STATS
+ struct netback_accel_stats stats;
+#endif
+#if defined(CONFIG_DEBUG_FS)
+ char *dbfs_dir_name;
+ struct dentry *dbfs_dir;
+ struct netback_accel_dbfs dbfs;
+#endif
+
+ /*! List */
+ struct netback_accel *next_bend;
+};
+
+
+/*
+ * Values for netback_accel.hw_state. States of resource allocation
+ * we can go through
+ */
+/*! No hardware has yet been allocated. */
+#define NETBACK_ACCEL_RES_NONE (0)
+/*! Hardware has been allocated. */
+#define NETBACK_ACCEL_RES_ALLOC (1)
+#define NETBACK_ACCEL_RES_FILTER (2)
+#define NETBACK_ACCEL_RES_HWINFO (3)
+
+/*! Filtering specification. This assumes that for VNIC support we
+ * will always want wildcard entries, so only specifies the
+ * destination IP/port
+ */
+struct netback_accel_filter_spec {
+ /*! Internal, used to access efx_vi API */
+ void *filter_handle;
+
+ /*! Destination IP in network order */
+ u32 destip_be;
+ /*! Destination port in network order */
+ u16 destport_be;
+ /*! Mac address */
+ u8 mac[ETH_ALEN];
+ /*! TCP or UDP */
+ u8 proto;
+};
+
+
+/**************************************************************************
+ * From accel.c
+ **************************************************************************/
+
+/*! \brief Start up all the acceleration plugins
+ *
+ * \return 0 on success, an errno on failure
+ */
+extern int netback_accel_init_accel(void);
+
+/*! \brief Shut down all the acceleration plugins
+ */
+extern void netback_accel_shutdown_accel(void);
+
+
+/**************************************************************************
+ * From accel_fwd.c
+ **************************************************************************/
+
+/*! \brief Init the forwarding infrastructure
+ * \return 0 on success, or -ENOMEM if it couldn't get memory for the
+ * forward table
+ */
+extern int netback_accel_init_fwd(void);
+
+/*! \brief Shut down the forwarding and free memory. */
+extern void netback_accel_shutdown_fwd(void);
+
+/*! Initialise each nic port's fowarding table */
+extern void *netback_accel_init_fwd_port(void);
+extern void netback_accel_shutdown_fwd_port(void *fwd_priv);
+
+/*! \brief Add an entry to the forwarding table.
+ * \param mac : MAC address, used as hash key
+ * \param ctxt : value to associate with key (can be NULL, see
+ * netback_accel_fwd_set_context)
+ * \return 0 on success, -ENOMEM if table was full and could no grow it
+ */
+extern int netback_accel_fwd_add(const __u8 *mac, void *context,
+ void *fwd_priv);
+
+/*! \brief Remove an entry from the forwarding table.
+ * \param mac : the MAC address to remove
+ * \return nothing: it is not an error if the mac was not in the table
+ */
+extern void netback_accel_fwd_remove(const __u8 *mac, void *fwd_priv);
+
+/*! \brief Set the context pointer for an existing fwd table entry.
+ * \param mac : key that is already present in the table
+ * \param context : new value to associate with key
+ * \return 0 on success, -ENOENT if mac not present in table.
+ */
+extern int netback_accel_fwd_set_context(const __u8 *mac, void *context,
+ void *fwd_priv);
+
+/**************************************************************************
+ * From accel_msg.c
+ **************************************************************************/
+
+
+/*! \brief Send the start-of-day message that handshakes with the VNIC
+ * and tells it its MAC address.
+ *
+ * \param bend The back end driver data structure
+ * \param version The version of communication to use, e.g. NET_ACCEL_MSG_VERSION
+ */
+extern void netback_accel_msg_tx_hello(struct netback_accel *bend,
+ unsigned version);
+
+/*! \brief Send a "there's a new local mac address" message
+ *
+ * \param bend The back end driver data structure for the vnic to send
+ * the message to
+ * \param mac Pointer to the new mac address
+ */
+extern void netback_accel_msg_tx_new_localmac(struct netback_accel *bend,
+ const void *mac);
+
+/*! \brief Send a "a mac address that was local has gone away" message
+ *
+ * \param bend The back end driver data structure for the vnic to send
+ * the message to
+ * \param mac Pointer to the old mac address
+ */
+extern void netback_accel_msg_tx_old_localmac(struct netback_accel *bend,
+ const void *mac);
+
+extern void netback_accel_set_interface_state(struct netback_accel *bend,
+ int up);
+
+/*! \brief Process the message queue for a bend that has just
+ * interrupted.
+ *
+ * Demultiplexs an interrupt from the front end driver, taking
+ * messages from the fifo and taking appropriate action.
+ *
+ * \param bend The back end driver data structure
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+extern void netback_accel_msg_rx_handler(struct work_struct *arg);
+#else
+extern void netback_accel_msg_rx_handler(void *bend_void);
+#endif
+
+/**************************************************************************
+ * From accel_xenbus.c
+ **************************************************************************/
+/*! List of all the bends currently in existence. */
+extern struct netback_accel *bend_list;
+extern struct mutex bend_list_mutex;
+
+/*! \brief Probe a new network interface. */
+extern int netback_accel_probe(struct xenbus_device *dev);
+
+/*! \brief Remove a network interface. */
+extern int netback_accel_remove(struct xenbus_device *dev);
+
+/*! \brief Shutdown all accelerator backends */
+extern void netback_accel_shutdown_bends(void);
+
+/*! \brief Initiate the xenbus state teardown handshake */
+extern void netback_accel_set_closing(struct netback_accel *bend);
+
+/**************************************************************************
+ * From accel_debugfs.c
+ **************************************************************************/
+/*! Global statistics */
+struct netback_accel_global_stats {
+ /*! Number of TX packets seen through driverlink */
+ u64 dl_tx_packets;
+ /*! Number of TX packets seen through driverlink we didn't like */
+ u64 dl_tx_bad_packets;
+ /*! Number of RX packets seen through driverlink */
+ u64 dl_rx_packets;
+ /*! Number of mac addresses we are forwarding to */
+ u32 num_fwds;
+};
+
+/*! Debug fs entries for each of the above stats */
+struct netback_accel_global_dbfs {
+ struct dentry *dl_tx_packets;
+ struct dentry *dl_tx_bad_packets;
+ struct dentry *dl_rx_packets;
+ struct dentry *num_fwds;
+};
+
+#if NETBACK_ACCEL_STATS
+extern struct netback_accel_global_stats global_stats;
+#endif
+
+/*! \brief Initialise the debugfs root and populate with global stats */
+extern void netback_accel_debugfs_init(void);
+
+/*! \brief Remove our debugfs root directory */
+extern void netback_accel_debugfs_fini(void);
+
+/*! \brief Add per-bend statistics to debug fs */
+extern int netback_accel_debugfs_create(struct netback_accel *bend);
+/*! \brief Remove per-bend statistics from debug fs */
+extern int netback_accel_debugfs_remove(struct netback_accel *bend);
+
+#endif /* NETBACK_ACCEL_H */
+
+
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+
+#include "accel.h"
+
+#if defined(CONFIG_DEBUG_FS)
+static struct dentry *sfc_debugfs_root = NULL;
+#endif
+
+#if NETBACK_ACCEL_STATS
+struct netback_accel_global_stats global_stats;
+#if defined(CONFIG_DEBUG_FS)
+static struct netback_accel_global_dbfs global_dbfs;
+#endif
+#endif
+
+void netback_accel_debugfs_init(void)
+{
+#if defined(CONFIG_DEBUG_FS)
+ sfc_debugfs_root = debugfs_create_dir("sfc_netback", NULL);
+ if (sfc_debugfs_root == NULL)
+ return;
+
+ global_dbfs.num_fwds = debugfs_create_u32
+ ("num_fwds", S_IRUSR | S_IRGRP | S_IROTH,
+ sfc_debugfs_root, &global_stats.num_fwds);
+ global_dbfs.dl_tx_packets = debugfs_create_u64
+ ("dl_tx_packets", S_IRUSR | S_IRGRP | S_IROTH,
+ sfc_debugfs_root, &global_stats.dl_tx_packets);
+ global_dbfs.dl_rx_packets = debugfs_create_u64
+ ("dl_rx_packets", S_IRUSR | S_IRGRP | S_IROTH,
+ sfc_debugfs_root, &global_stats.dl_rx_packets);
+ global_dbfs.dl_tx_bad_packets = debugfs_create_u64
+ ("dl_tx_bad_packets", S_IRUSR | S_IRGRP | S_IROTH,
+ sfc_debugfs_root, &global_stats.dl_tx_bad_packets);
+#endif
+}
+
+
+void netback_accel_debugfs_fini(void)
+{
+#if defined(CONFIG_DEBUG_FS)
+ debugfs_remove(global_dbfs.num_fwds);
+ debugfs_remove(global_dbfs.dl_tx_packets);
+ debugfs_remove(global_dbfs.dl_rx_packets);
+ debugfs_remove(global_dbfs.dl_tx_bad_packets);
+
+ debugfs_remove(sfc_debugfs_root);
+#endif
+}
+
+
+int netback_accel_debugfs_create(struct netback_accel *bend)
+{
+#if defined(CONFIG_DEBUG_FS)
+ /* Smallest length is 7 (vif0.0\n) */
+ int length = 7, temp;
+
+ if (sfc_debugfs_root == NULL)
+ return -ENOENT;
+
+ /* Work out length of string representation of far_end and vif_num */
+ temp = bend->far_end;
+ while (temp > 9) {
+ length++;
+ temp = temp / 10;
+ }
+ temp = bend->vif_num;
+ while (temp > 9) {
+ length++;
+ temp = temp / 10;
+ }
+
+ bend->dbfs_dir_name = kmalloc(length, GFP_KERNEL);
+ if (bend->dbfs_dir_name == NULL)
+ return -ENOMEM;
+ sprintf(bend->dbfs_dir_name, "vif%d.%d", bend->far_end, bend->vif_num);
+
+ bend->dbfs_dir = debugfs_create_dir(bend->dbfs_dir_name,
+ sfc_debugfs_root);
+ if (bend->dbfs_dir == NULL) {
+ kfree(bend->dbfs_dir_name);
+ return -ENOMEM;
+ }
+
+#if NETBACK_ACCEL_STATS
+ bend->dbfs.evq_wakeups = debugfs_create_u64
+ ("evq_wakeups", S_IRUSR | S_IRGRP | S_IROTH,
+ bend->dbfs_dir, &bend->stats.evq_wakeups);
+ bend->dbfs.evq_timeouts = debugfs_create_u64
+ ("evq_timeouts", S_IRUSR | S_IRGRP | S_IROTH,
+ bend->dbfs_dir, &bend->stats.evq_timeouts);
+ bend->dbfs.num_filters = debugfs_create_u32
+ ("num_filters", S_IRUSR | S_IRGRP | S_IROTH,
+ bend->dbfs_dir, &bend->stats.num_filters);
+ bend->dbfs.num_buffer_pages = debugfs_create_u32
+ ("num_buffer_pages", S_IRUSR | S_IRGRP | S_IROTH,
+ bend->dbfs_dir, &bend->stats.num_buffer_pages);
+#endif
+#endif
+ return 0;
+}
+
+
+int netback_accel_debugfs_remove(struct netback_accel *bend)
+{
+#if defined(CONFIG_DEBUG_FS)
+ if (bend->dbfs_dir != NULL) {
+#if NETBACK_ACCEL_STATS
+ debugfs_remove(bend->dbfs.evq_wakeups);
+ debugfs_remove(bend->dbfs.evq_timeouts);
+ debugfs_remove(bend->dbfs.num_filters);
+ debugfs_remove(bend->dbfs.num_buffer_pages);
+#endif
+ debugfs_remove(bend->dbfs_dir);
+ }
+
+ if (bend->dbfs_dir_name)
+ kfree(bend->dbfs_dir_name);
+#endif
+ return 0;
+}
+
+
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include "accel.h"
+#include "accel_cuckoo_hash.h"
+#include "accel_util.h"
+#include "accel_solarflare.h"
+
+#include "driverlink_api.h"
+
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+
+/* State stored in the forward table */
+struct fwd_struct {
+ struct list_head link; /* Forms list */
+ void * context;
+ __u8 valid;
+ __u8 mac[ETH_ALEN];
+};
+
+/* Max value we support */
+#define NUM_FWDS_BITS 8
+#define NUM_FWDS (1 << NUM_FWDS_BITS)
+#define FWD_MASK (NUM_FWDS - 1)
+
+struct port_fwd {
+ /* Make a list */
+ struct list_head link;
+ /* Hash table to store the fwd_structs */
+ cuckoo_hash_table fwd_hash_table;
+ /* The array of fwd_structs */
+ struct fwd_struct *fwd_array;
+ /* Linked list of entries in use. */
+ struct list_head fwd_list;
+ /* Could do something clever with a reader/writer lock. */
+ spinlock_t fwd_lock;
+ /* Make find_free_entry() a bit faster by caching this */
+ int last_free_index;
+};
+
+/*
+ * This is unlocked as it's only called from dl probe and remove,
+ * which are themselves synchronised. Could get rid of it entirely as
+ * it's never iterated, but useful for debug
+ */
+static struct list_head port_fwds;
+
+
+/* Search the fwd_array for an unused entry */
+static int fwd_find_free_entry(struct port_fwd *fwd_set)
+{
+ int index = fwd_set->last_free_index;
+
+ do {
+ if (!fwd_set->fwd_array[index].valid) {
+ fwd_set->last_free_index = index;
+ return index;
+ }
+ index++;
+ if (index >= NUM_FWDS)
+ index = 0;
+ } while (index != fwd_set->last_free_index);
+
+ return -ENOMEM;
+}
+
+
+/* Look up a MAC in the hash table. Caller should hold table lock. */
+static inline struct fwd_struct *fwd_find_entry(const __u8 *mac,
+ struct port_fwd *fwd_set)
+{
+ cuckoo_hash_value value;
+ cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac);
+
+ if (cuckoo_hash_lookup(&fwd_set->fwd_hash_table,
+ (cuckoo_hash_key *)(&key),
+ &value)) {
+ struct fwd_struct *fwd = &fwd_set->fwd_array[value];
+ DPRINTK_ON(memcmp(fwd->mac, mac, ETH_ALEN) != 0);
+ return fwd;
+ }
+
+ return NULL;
+}
+
+
+/* Initialise each nic port's fowarding table */
+void *netback_accel_init_fwd_port(void)
+{
+ struct port_fwd *fwd_set;
+
+ fwd_set = kzalloc(sizeof(struct port_fwd), GFP_KERNEL);
+ if (fwd_set == NULL) {
+ return NULL;
+ }
+
+ spin_lock_init(&fwd_set->fwd_lock);
+
+ fwd_set->fwd_array = kzalloc(sizeof (struct fwd_struct) * NUM_FWDS,
+ GFP_KERNEL);
+ if (fwd_set->fwd_array == NULL) {
+ kfree(fwd_set);
+ return NULL;
+ }
+
+ if (cuckoo_hash_init(&fwd_set->fwd_hash_table, NUM_FWDS_BITS, 8) != 0) {
+ kfree(fwd_set->fwd_array);
+ kfree(fwd_set);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&fwd_set->fwd_list);
+
+ list_add(&fwd_set->link, &port_fwds);
+
+ return fwd_set;
+}
+
+
+void netback_accel_shutdown_fwd_port(void *fwd_priv)
+{
+ struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
+
+ BUG_ON(fwd_priv == NULL);
+
+ BUG_ON(list_empty(&port_fwds));
+ list_del(&fwd_set->link);
+
+ BUG_ON(!list_empty(&fwd_set->fwd_list));
+
+ cuckoo_hash_destroy(&fwd_set->fwd_hash_table);
+ kfree(fwd_set->fwd_array);
+ kfree(fwd_set);
+}
+
+
+int netback_accel_init_fwd()
+{
+ INIT_LIST_HEAD(&port_fwds);
+ return 0;
+}
+
+
+void netback_accel_shutdown_fwd()
+{
+ BUG_ON(!list_empty(&port_fwds));
+}
+
+
+/*
+ * Add an entry to the forwarding table. Returns -ENOMEM if no
+ * space.
+ */
+int netback_accel_fwd_add(const __u8 *mac, void *context, void *fwd_priv)
+{
+ struct fwd_struct *fwd;
+ int rc = 0, index;
+ unsigned long flags;
+ cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac);
+ struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
+ DECLARE_MAC_BUF(buf);
+
+ BUG_ON(fwd_priv == NULL);
+
+ DPRINTK("Adding mac %s\n", print_mac(buf, mac));
+
+ spin_lock_irqsave(&fwd_set->fwd_lock, flags);
+
+ if ((rc = fwd_find_free_entry(fwd_set)) < 0 ) {
+ spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
+ return rc;
+ }
+
+ index = rc;
+
+ /* Shouldn't already be in the table */
+ if (cuckoo_hash_lookup(&fwd_set->fwd_hash_table,
+ (cuckoo_hash_key *)(&key), &rc) != 0) {
+ spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
+ EPRINTK("MAC address %s already accelerated.\n",
+ print_mac(buf, mac));
+ return -EEXIST;
+ }
+
+ if ((rc = cuckoo_hash_add(&fwd_set->fwd_hash_table,
+ (cuckoo_hash_key *)(&key), index, 1)) == 0) {
+ fwd = &fwd_set->fwd_array[index];
+ fwd->valid = 1;
+ fwd->context = context;
+ memcpy(fwd->mac, mac, ETH_ALEN);
+ list_add(&fwd->link, &fwd_set->fwd_list);
+ NETBACK_ACCEL_STATS_OP(global_stats.num_fwds++);
+ }
+
+ spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
+
+ /*
+ * No need to tell frontend that this mac address is local -
+ * it should auto-discover through packets on fastpath what is
+ * local and what is not, and just being on same server
+ * doesn't make it local (it could be on a different
+ * bridge)
+ */
+
+ return rc;
+}
+
+
+/* remove an entry from the forwarding tables. */
+void netback_accel_fwd_remove(const __u8 *mac, void *fwd_priv)
+{
+ struct fwd_struct *fwd;
+ unsigned long flags;
+ cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac);
+ struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
+ DECLARE_MAC_BUF(buf);
+
+ DPRINTK("Removing mac %s\n", print_mac(buf, mac));
+
+ BUG_ON(fwd_priv == NULL);
+
+ spin_lock_irqsave(&fwd_set->fwd_lock, flags);
+
+ fwd = fwd_find_entry(mac, fwd_set);
+ if (fwd != NULL) {
+ BUG_ON(list_empty(&fwd_set->fwd_list));
+ list_del(&fwd->link);
+
+ fwd->valid = 0;
+ cuckoo_hash_remove(&fwd_set->fwd_hash_table,
+ (cuckoo_hash_key *)(&key));
+ NETBACK_ACCEL_STATS_OP(global_stats.num_fwds--);
+ }
+ spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
+
+ /*
+ * No need to tell frontend that this is no longer present -
+ * the frontend is currently only interested in remote
+ * addresses and it works these out (mostly) by itself
+ */
+}
+
+
+/* Set the context pointer for a hash table entry. */
+int netback_accel_fwd_set_context(const __u8 *mac, void *context,
+ void *fwd_priv)
+{
+ struct fwd_struct *fwd;
+ unsigned long flags;
+ int rc = -ENOENT;
+ struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
+
+ BUG_ON(fwd_priv == NULL);
+
+ spin_lock_irqsave(&fwd_set->fwd_lock, flags);
+ fwd = fwd_find_entry(mac, fwd_set);
+ if (fwd != NULL) {
+ fwd->context = context;
+ rc = 0;
+ }
+ spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
+ return rc;
+}
+
+
+/**************************************************************************
+ * Process a received packet
+ **************************************************************************/
+
+/*
+ * Returns whether or not we have a match in our forward table for the
+ * this skb. Must be called with appropriate fwd_lock already held
+ */
+static struct netback_accel *for_a_vnic(struct netback_pkt_buf *skb,
+ struct port_fwd *fwd_set)
+{
+ struct fwd_struct *fwd;
+ struct netback_accel *retval = NULL;
+
+ fwd = fwd_find_entry(skb->mac.raw, fwd_set);
+ if (fwd != NULL)
+ retval = fwd->context;
+ return retval;
+}
+
+
+static inline int packet_is_arp_reply(struct sk_buff *skb)
+{
+ return skb->protocol == ntohs(ETH_P_ARP)
+ && arp_hdr(skb)->ar_op == ntohs(ARPOP_REPLY);
+}
+
+
+static inline void hdr_to_filt(struct ethhdr *ethhdr, struct iphdr *ip,
+ struct netback_accel_filter_spec *spec)
+{
+ spec->proto = ip->protocol;
+ spec->destip_be = ip->daddr;
+ memcpy(spec->mac, ethhdr->h_source, ETH_ALEN);
+
+ if (ip->protocol == IPPROTO_TCP) {
+ struct tcphdr *tcp = (struct tcphdr *)((char *)ip + 4 * ip->ihl);
+ spec->destport_be = tcp->dest;
+ } else {
+ struct udphdr *udp = (struct udphdr *)((char *)ip + 4 * ip->ihl);
+ EPRINTK_ON(ip->protocol != IPPROTO_UDP);
+ spec->destport_be = udp->dest;
+ }
+}
+
+
+static inline int netback_accel_can_filter(struct netback_pkt_buf *skb)
+{
+ return (skb->protocol == htons(ETH_P_IP) &&
+ ((skb->nh.iph->protocol == IPPROTO_TCP) ||
+ (skb->nh.iph->protocol == IPPROTO_UDP)));
+}
+
+
+static inline void netback_accel_filter_packet(struct netback_accel *bend,
+ struct netback_pkt_buf *skb)
+{
+ struct netback_accel_filter_spec fs;
+ struct ethhdr *eh = (struct ethhdr *)(skb->mac.raw);
+
+ hdr_to_filt(eh, skb->nh.iph, &fs);
+
+ netback_accel_filter_check_add(bend, &fs);
+}
+
+
+/*
+ * Receive a packet and do something appropriate with it. Return true
+ * to take exclusive ownership of the packet. This is verging on
+ * solarflare specific
+ */
+void netback_accel_rx_packet(struct netback_pkt_buf *skb, void *fwd_priv)
+{
+ struct netback_accel *bend;
+ struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
+ unsigned long flags;
+
+ BUG_ON(fwd_priv == NULL);
+
+ /* Checking for bcast is cheaper so do that first */
+ if (is_broadcast_ether_addr(skb->mac.raw)) {
+ /* pass through the slow path by not claiming ownership */
+ return;
+ } else if (is_multicast_ether_addr(skb->mac.raw)) {
+ /* pass through the slow path by not claiming ownership */
+ return;
+ } else {
+ /* It is unicast */
+ spin_lock_irqsave(&fwd_set->fwd_lock, flags);
+ /* We insert filter to pass it off to a VNIC */
+ if ((bend = for_a_vnic(skb, fwd_set)) != NULL)
+ if (netback_accel_can_filter(skb))
+ netback_accel_filter_packet(bend, skb);
+ spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
+ }
+ return;
+}
+
+
+void netback_accel_tx_packet(struct sk_buff *skb, void *fwd_priv)
+{
+ __u8 *mac;
+ unsigned long flags;
+ struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
+ struct fwd_struct *fwd;
+
+ BUG_ON(fwd_priv == NULL);
+
+ if (is_broadcast_ether_addr(skb_mac_header(skb))
+ && packet_is_arp_reply(skb)) {
+ DECLARE_MAC_BUF(buf);
+
+ /*
+ * update our fast path forwarding to reflect this
+ * gratuitous ARP
+ */
+ mac = skb_mac_header(skb)+ETH_ALEN;
+
+ DPRINTK("%s: found gratuitous ARP for %s\n",
+ __FUNCTION__, print_mac(buf, mac));
+
+ spin_lock_irqsave(&fwd_set->fwd_lock, flags);
+ /*
+ * Might not be local, but let's tell them all it is,
+ * and they can restore the fastpath if they continue
+ * to get packets that way
+ */
+ list_for_each_entry(fwd, &fwd_set->fwd_list, link) {
+ struct netback_accel *bend = fwd->context;
+ if (bend != NULL)
+ netback_accel_msg_tx_new_localmac(bend, mac);
+ }
+
+ spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
+ }
+ return;
+}
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <xen/evtchn.h>
+
+#include "accel.h"
+#include "accel_msg_iface.h"
+#include "accel_util.h"
+#include "accel_solarflare.h"
+
+/* Send a HELLO to front end to start things off */
+void netback_accel_msg_tx_hello(struct netback_accel *bend, unsigned version)
+{
+ unsigned long lock_state;
+ struct net_accel_msg *msg =
+ net_accel_msg_start_send(bend->shared_page,
+ &bend->to_domU, &lock_state);
+ /* The queue _cannot_ be full, we're the first users. */
+ EPRINTK_ON(msg == NULL);
+
+ if (msg != NULL) {
+ net_accel_msg_init(msg, NET_ACCEL_MSG_HELLO);
+ msg->u.hello.version = version;
+ msg->u.hello.max_pages = bend->quotas.max_buf_pages;
+ VPRINTK("Sending hello to channel %d\n", bend->msg_channel);
+ net_accel_msg_complete_send_notify(bend->shared_page,
+ &bend->to_domU,
+ &lock_state,
+ bend->msg_channel_irq);
+ }
+}
+
+/* Send a local mac message to vnic */
+static void netback_accel_msg_tx_localmac(struct netback_accel *bend,
+ int type, const void *mac)
+{
+ unsigned long lock_state;
+ struct net_accel_msg *msg;
+ DECLARE_MAC_BUF(buf);
+
+ BUG_ON(bend == NULL || mac == NULL);
+
+ VPRINTK("Sending local mac message: %s\n", print_mac(buf, mac));
+
+ msg = net_accel_msg_start_send(bend->shared_page, &bend->to_domU,
+ &lock_state);
+
+ if (msg != NULL) {
+ net_accel_msg_init(msg, NET_ACCEL_MSG_LOCALMAC);
+ msg->u.localmac.flags = type;
+ memcpy(msg->u.localmac.mac, mac, ETH_ALEN);
+ net_accel_msg_complete_send_notify(bend->shared_page,
+ &bend->to_domU,
+ &lock_state,
+ bend->msg_channel_irq);
+ } else {
+ /*
+ * TODO if this happens we may leave a domU
+ * fastpathing packets when they should be delivered
+ * locally. Solution is get domU to timeout entries
+ * in its fastpath lookup table when it receives no RX
+ * traffic
+ */
+ EPRINTK("%s: saw full queue, may need ARP timer to recover\n",
+ __FUNCTION__);
+ }
+}
+
+/* Send an add local mac message to vnic */
+void netback_accel_msg_tx_new_localmac(struct netback_accel *bend,
+ const void *mac)
+{
+ netback_accel_msg_tx_localmac(bend, NET_ACCEL_MSG_ADD, mac);
+}
+
+
+static int netback_accel_msg_rx_buffer_map(struct netback_accel *bend,
+ struct net_accel_msg *msg)
+{
+ int log2_pages, rc;
+
+ /* Can only allocate in power of two */
+ log2_pages = log2_ge(msg->u.mapbufs.pages, 0);
+ if (msg->u.mapbufs.pages != pow2(log2_pages)) {
+ EPRINTK("%s: Can only alloc bufs in power of 2 sizes (%d)\n",
+ __FUNCTION__, msg->u.mapbufs.pages);
+ rc = -EINVAL;
+ goto err_out;
+ }
+
+ /*
+ * Sanity. Assumes NET_ACCEL_MSG_MAX_PAGE_REQ is same for
+ * both directions/domains
+ */
+ if (msg->u.mapbufs.pages > NET_ACCEL_MSG_MAX_PAGE_REQ) {
+ EPRINTK("%s: too many pages in a single message: %d %d\n",
+ __FUNCTION__, msg->u.mapbufs.pages,
+ NET_ACCEL_MSG_MAX_PAGE_REQ);
+ rc = -EINVAL;
+ goto err_out;
+ }
+
+ if ((rc = netback_accel_add_buffers(bend, msg->u.mapbufs.pages,
+ log2_pages, msg->u.mapbufs.grants,
+ &msg->u.mapbufs.buf)) < 0) {
+ goto err_out;
+ }
+
+ msg->id |= NET_ACCEL_MSG_REPLY;
+
+ return 0;
+
+ err_out:
+ EPRINTK("%s: err_out\n", __FUNCTION__);
+ msg->id |= NET_ACCEL_MSG_ERROR | NET_ACCEL_MSG_REPLY;
+ return rc;
+}
+
+
+/* Hint from frontend that one of our filters is out of date */
+static int netback_accel_process_fastpath(struct netback_accel *bend,
+ struct net_accel_msg *msg)
+{
+ struct netback_accel_filter_spec spec;
+
+ if (msg->u.fastpath.flags & NET_ACCEL_MSG_REMOVE) {
+ /*
+ * Would be nice to BUG() this but would leave us
+ * vulnerable to naughty frontend
+ */
+ EPRINTK_ON(msg->u.fastpath.flags & NET_ACCEL_MSG_ADD);
+
+ memcpy(spec.mac, msg->u.fastpath.mac, ETH_ALEN);
+ spec.destport_be = msg->u.fastpath.port;
+ spec.destip_be = msg->u.fastpath.ip;
+ spec.proto = msg->u.fastpath.proto;
+
+ netback_accel_filter_remove_spec(bend, &spec);
+ }
+
+ return 0;
+}
+
+
+/* Flow control for message queues */
+inline void set_queue_not_full(struct netback_accel *bend)
+{
+ if (!test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL_B,
+ (unsigned long *)&bend->shared_page->aflags))
+ notify_remote_via_irq(bend->msg_channel_irq);
+ else
+ VPRINTK("queue not full bit already set, not signalling\n");
+}
+
+
+/* Flow control for message queues */
+inline void set_queue_full(struct netback_accel *bend)
+{
+ if (!test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0FULL_B,
+ (unsigned long *)&bend->shared_page->aflags))
+ notify_remote_via_irq(bend->msg_channel_irq);
+ else
+ VPRINTK("queue full bit already set, not signalling\n");
+}
+
+
+void netback_accel_set_interface_state(struct netback_accel *bend, int up)
+{
+ bend->shared_page->net_dev_up = up;
+ if (!test_and_set_bit(NET_ACCEL_MSG_AFLAGS_NETUPDOWN_B,
+ (unsigned long *)&bend->shared_page->aflags))
+ notify_remote_via_irq(bend->msg_channel_irq);
+ else
+ VPRINTK("interface up/down bit already set, not signalling\n");
+}
+
+
+static int check_rx_hello_version(unsigned version)
+{
+ /* Should only happen if there's been a version mismatch */
+ BUG_ON(version == NET_ACCEL_MSG_VERSION);
+
+ if (version > NET_ACCEL_MSG_VERSION) {
+ /* Newer protocol, we must refuse */
+ return -EPROTO;
+ }
+
+ if (version < NET_ACCEL_MSG_VERSION) {
+ /*
+ * We are newer, so have discretion to accept if we
+ * wish. For now however, just reject
+ */
+ return -EPROTO;
+ }
+
+ return -EINVAL;
+}
+
+
+static int process_rx_msg(struct netback_accel *bend,
+ struct net_accel_msg *msg)
+{
+ int err = 0;
+
+ switch (msg->id) {
+ case NET_ACCEL_MSG_REPLY | NET_ACCEL_MSG_HELLO:
+ /* Reply to a HELLO; mark ourselves as connected */
+ DPRINTK("got Hello reply, version %.8x\n",
+ msg->u.hello.version);
+
+ /*
+ * Check that we've not successfully done this
+ * already. NB no check at the moment that this reply
+ * comes after we've actually sent a HELLO as that's
+ * not possible with the current code structure
+ */
+ if (bend->hw_state != NETBACK_ACCEL_RES_NONE)
+ return -EPROTO;
+
+ /* Store max_pages for accel_setup */
+ if (msg->u.hello.max_pages > bend->quotas.max_buf_pages) {
+ EPRINTK("More pages than quota allows (%d > %d)\n",
+ msg->u.hello.max_pages,
+ bend->quotas.max_buf_pages);
+ /* Force it down to the quota */
+ msg->u.hello.max_pages = bend->quotas.max_buf_pages;
+ }
+ bend->max_pages = msg->u.hello.max_pages;
+
+ /* Set up the hardware visible to the other end */
+ err = bend->accel_setup(bend);
+ if (err) {
+ /* This is fatal */
+ DPRINTK("Hello gave accel_setup error %d\n", err);
+ netback_accel_set_closing(bend);
+ } else {
+ /*
+ * Now add the context so that packet
+ * forwarding will commence
+ */
+ netback_accel_fwd_set_context(bend->mac, bend,
+ bend->fwd_priv);
+ }
+ break;
+ case NET_ACCEL_MSG_REPLY | NET_ACCEL_MSG_HELLO | NET_ACCEL_MSG_ERROR:
+ EPRINTK("got Hello error, versions us:%.8x them:%.8x\n",
+ NET_ACCEL_MSG_VERSION, msg->u.hello.version);
+
+ if (bend->hw_state != NETBACK_ACCEL_RES_NONE)
+ return -EPROTO;
+
+ if (msg->u.hello.version != NET_ACCEL_MSG_VERSION) {
+ /* Error is due to version mismatch */
+ err = check_rx_hello_version(msg->u.hello.version);
+ if (err == 0) {
+ /*
+ * It's OK to be compatible, send
+ * another hello with compatible version
+ */
+ netback_accel_msg_tx_hello
+ (bend, msg->u.hello.version);
+ } else {
+ /*
+ * Tell frontend that we're not going to
+ * send another HELLO by going to Closing.
+ */
+ netback_accel_set_closing(bend);
+ }
+ }
+ break;
+ case NET_ACCEL_MSG_MAPBUF:
+ VPRINTK("Got mapped buffers request %d\n",
+ msg->u.mapbufs.reqid);
+
+ if (bend->hw_state == NETBACK_ACCEL_RES_NONE)
+ return -EPROTO;
+
+ /*
+ * Frontend wants a buffer table entry for the
+ * supplied pages
+ */
+ err = netback_accel_msg_rx_buffer_map(bend, msg);
+ if (net_accel_msg_reply_notify(bend->shared_page,
+ bend->msg_channel_irq,
+ &bend->to_domU, msg)) {
+ /*
+ * This is fatal as we can't tell the frontend
+ * about the problem through the message
+ * queue, and so would otherwise stalemate
+ */
+ netback_accel_set_closing(bend);
+ }
+ break;
+ case NET_ACCEL_MSG_FASTPATH:
+ DPRINTK("Got fastpath request\n");
+
+ if (bend->hw_state == NETBACK_ACCEL_RES_NONE)
+ return -EPROTO;
+
+ err = netback_accel_process_fastpath(bend, msg);
+ break;
+ default:
+ EPRINTK("Huh? Message code is %x\n", msg->id);
+ err = -EPROTO;
+ break;
+ }
+ return err;
+}
+
+
+/* Demultiplex an IRQ from the frontend driver. */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+void netback_accel_msg_rx_handler(struct work_struct *arg)
+#else
+void netback_accel_msg_rx_handler(void *bend_void)
+#endif
+{
+ struct net_accel_msg msg;
+ int err, queue_was_full = 0;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+ struct netback_accel *bend =
+ container_of(arg, struct netback_accel, handle_msg);
+#else
+ struct netback_accel *bend = (struct netback_accel *)bend_void;
+#endif
+
+ mutex_lock(&bend->bend_mutex);
+
+ /*
+ * This happens when the shared pages have been unmapped, but
+ * the workqueue not flushed yet
+ */
+ if (bend->shared_page == NULL)
+ goto done;
+
+ if ((bend->shared_page->aflags &
+ NET_ACCEL_MSG_AFLAGS_TO_DOM0_MASK) != 0) {
+ if (bend->shared_page->aflags &
+ NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL) {
+ /* We've been told there may now be space. */
+ clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL_B,
+ (unsigned long *)&bend->shared_page->aflags);
+ }
+
+ if (bend->shared_page->aflags &
+ NET_ACCEL_MSG_AFLAGS_QUEUEUFULL) {
+ clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUFULL_B,
+ (unsigned long *)&bend->shared_page->aflags);
+ queue_was_full = 1;
+ }
+ }
+
+ while ((err = net_accel_msg_recv(bend->shared_page, &bend->from_domU,
+ &msg)) == 0) {
+ err = process_rx_msg(bend, &msg);
+
+ if (err != 0) {
+ EPRINTK("%s: Error %d\n", __FUNCTION__, err);
+ goto err;
+ }
+ }
+
+ err:
+ /* There will be space now if we can make any. */
+ if (queue_was_full)
+ set_queue_not_full(bend);
+ done:
+ mutex_unlock(&bend->bend_mutex);
+
+ return;
+}
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include "common.h"
+
+#include "accel.h"
+#include "accel_solarflare.h"
+#include "accel_msg_iface.h"
+#include "accel_util.h"
+
+#include "accel_cuckoo_hash.h"
+
+#include "ci/driver/resource/efx_vi.h"
+
+#include "ci/efrm/nic_table.h"
+#include "ci/efhw/public.h"
+
+#include <xen/evtchn.h>
+#include <xen/driver_util.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+
+#include "driverlink_api.h"
+
+#define SF_XEN_RX_USR_BUF_SIZE 2048
+
+struct falcon_bend_accel_priv {
+ struct efx_vi_state *efx_vih;
+
+ /*! Array of pointers to dma_map state, used so VNIC can
+ * request their removal in a single message
+ */
+ struct efx_vi_dma_map_state **dma_maps;
+ /*! Index into dma_maps */
+ int dma_maps_index;
+
+ /*! Serialises access to filters */
+ spinlock_t filter_lock;
+ /*! Bitmap of which filters are free */
+ unsigned long free_filters;
+ /*! Used for index normalisation */
+ u32 filter_idx_mask;
+ struct netback_accel_filter_spec *fspecs;
+ cuckoo_hash_table filter_hash_table;
+
+ u32 txdmaq_gnt;
+ u32 rxdmaq_gnt;
+ u32 doorbell_gnt;
+ u32 evq_rptr_gnt;
+ u32 evq_mem_gnts[EF_HW_FALCON_EVQ_PAGES];
+ u32 evq_npages;
+};
+
+/* Forward declaration */
+static int netback_accel_filter_init(struct netback_accel *);
+static void netback_accel_filter_shutdown(struct netback_accel *);
+
+/**************************************************************************
+ *
+ * Driverlink stuff
+ *
+ **************************************************************************/
+
+struct driverlink_port {
+ struct list_head link;
+ enum net_accel_hw_type type;
+ struct net_device *net_dev;
+ struct efx_dl_device *efx_dl_dev;
+ int nic_index;
+ void *fwd_priv;
+};
+
+static struct list_head dl_ports;
+
+/* This mutex protects global state, such as the dl_ports list */
+DEFINE_MUTEX(accel_mutex);
+
+static int init_done = 0;
+
+/* The DL callbacks */
+
+
+#if defined(EFX_USE_FASTCALL)
+static enum efx_veto fastcall
+#else
+static enum efx_veto
+#endif
+bend_dl_tx_packet(struct efx_dl_device *efx_dl_dev,
+ struct sk_buff *skb)
+{
+ struct driverlink_port *port = efx_dl_dev->priv;
+
+ BUG_ON(port == NULL);
+
+ NETBACK_ACCEL_STATS_OP(global_stats.dl_tx_packets++);
+ if (skb_mac_header_was_set(skb))
+ netback_accel_tx_packet(skb, port->fwd_priv);
+ else {
+ DPRINTK("Ignoring packet with missing mac address\n");
+ NETBACK_ACCEL_STATS_OP(global_stats.dl_tx_bad_packets++);
+ }
+ return EFX_ALLOW_PACKET;
+}
+
+/* EFX_USE_FASTCALL */
+#if defined(EFX_USE_FASTCALL)
+static enum efx_veto fastcall
+#else
+static enum efx_veto
+#endif
+bend_dl_rx_packet(struct efx_dl_device *efx_dl_dev,
+ const char *pkt_buf, int pkt_len)
+{
+ struct driverlink_port *port = efx_dl_dev->priv;
+ struct netback_pkt_buf pkt;
+ struct ethhdr *eh;
+
+ BUG_ON(port == NULL);
+
+ pkt.mac.raw = (char *)pkt_buf;
+ pkt.nh.raw = (char *)pkt_buf + ETH_HLEN;
+ eh = (struct ethhdr *)pkt_buf;
+ pkt.protocol = eh->h_proto;
+
+ NETBACK_ACCEL_STATS_OP(global_stats.dl_rx_packets++);
+ netback_accel_rx_packet(&pkt, port->fwd_priv);
+ return EFX_ALLOW_PACKET;
+}
+
+
+/* Callbacks we'd like to get from the netdriver through driverlink */
+struct efx_dl_callbacks bend_dl_callbacks =
+ {
+ .tx_packet = bend_dl_tx_packet,
+ .rx_packet = bend_dl_rx_packet,
+ };
+
+
+static struct netback_accel_hooks accel_hooks = {
+ THIS_MODULE,
+ &netback_accel_probe,
+ &netback_accel_remove
+};
+
+
+/*
+ * Handy helper which given an efx_dl_device works out which
+ * efab_nic_t index into efrm_nic_table.nics[] it corresponds to
+ */
+static int efx_device_to_efab_nic_index(struct efx_dl_device *efx_dl_dev)
+{
+ int i;
+
+ for (i = 0; i < EFHW_MAX_NR_DEVS; i++) {
+ struct efhw_nic *nic = efrm_nic_tablep->nic[i];
+
+ /*
+ * It's possible for the nic structure to have not
+ * been initialised if the resource driver failed its
+ * driverlink probe
+ */
+ if (nic == NULL || nic->net_driver_dev == NULL)
+ continue;
+
+ /* Work out if these are talking about the same NIC */
+ if (nic->net_driver_dev->pci_dev == efx_dl_dev->pci_dev)
+ return i;
+ }
+
+ return -1;
+}
+
+
+/* Driver link probe - register our callbacks */
+static int bend_dl_probe(struct efx_dl_device *efx_dl_dev,
+ const struct net_device *net_dev,
+ const struct efx_dl_device_info *dev_info,
+ const char* silicon_rev)
+{
+ int rc;
+ enum net_accel_hw_type type;
+ struct driverlink_port *port;
+
+ DPRINTK("%s: %s\n", __FUNCTION__, silicon_rev);
+
+ if (strcmp(silicon_rev, "falcon/a1") == 0)
+ type = NET_ACCEL_MSG_HWTYPE_FALCON_A;
+ else if (strcmp(silicon_rev, "falcon/b0") == 0)
+ type = NET_ACCEL_MSG_HWTYPE_FALCON_B;
+ else {
+ EPRINTK("%s: unsupported silicon %s\n", __FUNCTION__,
+ silicon_rev);
+ rc = -EINVAL;
+ goto fail1;
+ }
+
+ port = kmalloc(sizeof(struct driverlink_port), GFP_KERNEL);
+ if (port == NULL) {
+ EPRINTK("%s: no memory for dl probe\n", __FUNCTION__);
+ rc = -ENOMEM;
+ goto fail1;
+ }
+
+ port->efx_dl_dev = efx_dl_dev;
+ efx_dl_dev->priv = port;
+
+ port->nic_index = efx_device_to_efab_nic_index(efx_dl_dev);
+ if (port->nic_index < 0) {
+ /*
+ * This can happen in theory if the resource driver
+ * failed to initialise properly
+ */
+ EPRINTK("%s: nic structure not found\n", __FUNCTION__);
+ rc = -EINVAL;
+ goto fail2;
+ }
+
+ port->fwd_priv = netback_accel_init_fwd_port();
+ if (port->fwd_priv == NULL) {
+ EPRINTK("%s: failed to set up forwarding for port\n",
+ __FUNCTION__);
+ rc = -ENOMEM;
+ goto fail2;
+ }
+
+ rc = efx_dl_register_callbacks(efx_dl_dev, &bend_dl_callbacks);
+ if (rc != 0) {
+ EPRINTK("%s: register_callbacks failed\n", __FUNCTION__);
+ goto fail3;
+ }
+
+ port->type = type;
+ port->net_dev = (struct net_device *)net_dev;
+
+ mutex_lock(&accel_mutex);
+ list_add(&port->link, &dl_ports);
+ mutex_unlock(&accel_mutex);
+
+ rc = netback_connect_accelerator(NETBACK_ACCEL_VERSION, 0,
+ port->net_dev->name, &accel_hooks);
+
+ if (rc < 0) {
+ EPRINTK("Xen netback accelerator version mismatch\n");
+ goto fail4;
+ } else if (rc > 0) {
+ /*
+ * In future may want to add backwards compatibility
+ * and accept certain subsets of previous versions
+ */
+ EPRINTK("Xen netback accelerator version mismatch\n");
+ goto fail4;
+ }
+
+ return 0;
+
+ fail4:
+ mutex_lock(&accel_mutex);
+ list_del(&port->link);
+ mutex_unlock(&accel_mutex);
+
+ efx_dl_unregister_callbacks(efx_dl_dev, &bend_dl_callbacks);
+ fail3:
+ netback_accel_shutdown_fwd_port(port->fwd_priv);
+ fail2:
+ efx_dl_dev->priv = NULL;
+ kfree(port);
+ fail1:
+ return rc;
+}
+
+
+static void bend_dl_remove(struct efx_dl_device *efx_dl_dev)
+{
+ struct driverlink_port *port;
+
+ DPRINTK("Unregistering driverlink callbacks.\n");
+
+ mutex_lock(&accel_mutex);
+
+ port = (struct driverlink_port *)efx_dl_dev->priv;
+
+ BUG_ON(list_empty(&dl_ports));
+ BUG_ON(port == NULL);
+ BUG_ON(port->efx_dl_dev != efx_dl_dev);
+
+ netback_disconnect_accelerator(0, port->net_dev->name);
+
+ list_del(&port->link);
+
+ mutex_unlock(&accel_mutex);
+
+ efx_dl_unregister_callbacks(efx_dl_dev, &bend_dl_callbacks);
+ netback_accel_shutdown_fwd_port(port->fwd_priv);
+
+ efx_dl_dev->priv = NULL;
+ kfree(port);
+
+ return;
+}
+
+
+static struct efx_dl_driver bend_dl_driver =
+ {
+ .name = "SFC Xen backend",
+ .probe = bend_dl_probe,
+ .remove = bend_dl_remove,
+ };
+
+
+int netback_accel_sf_init(void)
+{
+ int rc, nic_i;
+ struct efhw_nic *nic;
+
+ INIT_LIST_HEAD(&dl_ports);
+
+ rc = efx_dl_register_driver(&bend_dl_driver);
+ /* If we couldn't find the NET driver, give up */
+ if (rc == -ENOENT)
+ return rc;
+
+ if (rc == 0) {
+ EFRM_FOR_EACH_NIC(nic_i, nic)
+ falcon_nic_set_rx_usr_buf_size(nic,
+ SF_XEN_RX_USR_BUF_SIZE);
+ }
+
+ init_done = (rc == 0);
+ return rc;
+}
+
+
+void netback_accel_sf_shutdown(void)
+{
+ if (!init_done)
+ return;
+ DPRINTK("Unregistering driverlink driver\n");
+
+ /*
+ * This will trigger removal callbacks for all the devices, which
+ * will unregister their callbacks, disconnect from netfront, etc.
+ */
+ efx_dl_unregister_driver(&bend_dl_driver);
+}
+
+
+int netback_accel_sf_hwtype(struct netback_accel *bend)
+{
+ struct driverlink_port *port;
+
+ mutex_lock(&accel_mutex);
+
+ list_for_each_entry(port, &dl_ports, link) {
+ if (strcmp(bend->nicname, port->net_dev->name) == 0) {
+ bend->hw_type = port->type;
+ bend->accel_setup = netback_accel_setup_vnic_hw;
+ bend->accel_shutdown = netback_accel_shutdown_vnic_hw;
+ bend->fwd_priv = port->fwd_priv;
+ /* This is just needed to pass to efx_vi_alloc */
+ bend->nic_index = port->nic_index;
+ bend->net_dev = port->net_dev;
+ mutex_unlock(&accel_mutex);
+ return 0;
+ }
+ }
+
+ mutex_unlock(&accel_mutex);
+
+ EPRINTK("Failed to identify backend device '%s' with a NIC\n",
+ bend->nicname);
+
+ return -ENOENT;
+}
+
+
+/****************************************************************************
+ * Resource management code
+ ***************************************************************************/
+
+static int alloc_page_state(struct netback_accel *bend, int max_pages)
+{
+ struct falcon_bend_accel_priv *accel_hw_priv;
+
+ if (max_pages < 0 || max_pages > bend->quotas.max_buf_pages) {
+ EPRINTK("%s: invalid max_pages: %d\n", __FUNCTION__, max_pages);
+ return -EINVAL;
+ }
+
+ accel_hw_priv = kzalloc(sizeof(struct falcon_bend_accel_priv),
+ GFP_KERNEL);
+ if (accel_hw_priv == NULL) {
+ EPRINTK("%s: no memory for accel_hw_priv\n", __FUNCTION__);
+ return -ENOMEM;
+ }
+
+ accel_hw_priv->dma_maps = kzalloc
+ (sizeof(struct efx_vi_dma_map_state **) *
+ (max_pages / NET_ACCEL_MSG_MAX_PAGE_REQ), GFP_KERNEL);
+ if (accel_hw_priv->dma_maps == NULL) {
+ EPRINTK("%s: no memory for dma_maps\n", __FUNCTION__);
+ kfree(accel_hw_priv);
+ return -ENOMEM;
+ }
+
+ bend->buffer_maps = kzalloc(sizeof(struct vm_struct *) * max_pages,
+ GFP_KERNEL);
+ if (bend->buffer_maps == NULL) {
+ EPRINTK("%s: no memory for buffer_maps\n", __FUNCTION__);
+ kfree(accel_hw_priv->dma_maps);
+ kfree(accel_hw_priv);
+ return -ENOMEM;
+ }
+
+ bend->buffer_addrs = kzalloc(sizeof(u64) * max_pages, GFP_KERNEL);
+ if (bend->buffer_addrs == NULL) {
+ kfree(bend->buffer_maps);
+ kfree(accel_hw_priv->dma_maps);
+ kfree(accel_hw_priv);
+ return -ENOMEM;
+ }
+
+ bend->accel_hw_priv = accel_hw_priv;
+
+ return 0;
+}
+
+
+static int free_page_state(struct netback_accel *bend)
+{
+ struct falcon_bend_accel_priv *accel_hw_priv;
+
+ DPRINTK("%s: %p\n", __FUNCTION__, bend);
+
+ accel_hw_priv = bend->accel_hw_priv;
+
+ if (accel_hw_priv) {
+ kfree(accel_hw_priv->dma_maps);
+ kfree(bend->buffer_maps);
+ kfree(bend->buffer_addrs);
+ kfree(accel_hw_priv);
+ bend->accel_hw_priv = NULL;
+ bend->max_pages = 0;
+ }
+
+ return 0;
+}
+
+
+/* The timeout event callback for the event q */
+static void bend_evq_timeout(void *context, int is_timeout)
+{
+ struct netback_accel *bend = (struct netback_accel *)context;
+ if (is_timeout) {
+ /* Pass event to vnic front end driver */
+ VPRINTK("timeout event to %d\n", bend->net_channel);
+ NETBACK_ACCEL_STATS_OP(bend->stats.evq_timeouts++);
+ notify_remote_via_irq(bend->net_channel_irq);
+ } else {
+ /* It's a wakeup event, used by Falcon */
+ VPRINTK("wakeup to %d\n", bend->net_channel);
+ NETBACK_ACCEL_STATS_OP(bend->stats.evq_wakeups++);
+ notify_remote_via_irq(bend->net_channel_irq);
+ }
+}
+
+
+/*
+ * Create the eventq and associated gubbins for communication with the
+ * front end vnic driver
+ */
+static int ef_get_vnic(struct netback_accel *bend)
+{
+ struct falcon_bend_accel_priv *accel_hw_priv;
+ int rc = 0;
+
+ BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_NONE);
+
+ /* Allocate page related state and accel_hw_priv */
+ rc = alloc_page_state(bend, bend->max_pages);
+ if (rc != 0) {
+ EPRINTK("Failed to allocate page state: %d\n", rc);
+ return rc;
+ }
+
+ accel_hw_priv = bend->accel_hw_priv;
+
+ rc = efx_vi_alloc(&accel_hw_priv->efx_vih, bend->nic_index);
+ if (rc != 0) {
+ EPRINTK("%s: efx_vi_alloc failed %d\n", __FUNCTION__, rc);
+ free_page_state(bend);
+ return rc;
+ }
+
+ rc = efx_vi_eventq_register_callback(accel_hw_priv->efx_vih,
+ bend_evq_timeout,
+ bend);
+ if (rc != 0) {
+ EPRINTK("%s: register_callback failed %d\n", __FUNCTION__, rc);
+ efx_vi_free(accel_hw_priv->efx_vih);
+ free_page_state(bend);
+ return rc;
+ }
+
+ bend->hw_state = NETBACK_ACCEL_RES_ALLOC;
+
+ return 0;
+}
+
+
+static void ef_free_vnic(struct netback_accel *bend)
+{
+ struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+
+ BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_ALLOC);
+
+ efx_vi_eventq_kill_callback(accel_hw_priv->efx_vih);
+
+ DPRINTK("Hardware is freeable. Will proceed.\n");
+
+ efx_vi_free(accel_hw_priv->efx_vih);
+ accel_hw_priv->efx_vih = NULL;
+
+ VPRINTK("Free page state...\n");
+ free_page_state(bend);
+
+ bend->hw_state = NETBACK_ACCEL_RES_NONE;
+}
+
+
+static inline void ungrant_or_crash(grant_ref_t gntref, int domain) {
+ if (net_accel_ungrant_page(gntref) == -EBUSY)
+ net_accel_shutdown_remote(domain);
+}
+
+
+static void netback_accel_release_hwinfo(struct netback_accel *bend)
+{
+ struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+ int i;
+
+ DPRINTK("Remove dma q grants %d %d\n", accel_hw_priv->txdmaq_gnt,
+ accel_hw_priv->rxdmaq_gnt);
+ ungrant_or_crash(accel_hw_priv->txdmaq_gnt, bend->far_end);
+ ungrant_or_crash(accel_hw_priv->rxdmaq_gnt, bend->far_end);
+
+ DPRINTK("Remove doorbell grant %d\n", accel_hw_priv->doorbell_gnt);
+ ungrant_or_crash(accel_hw_priv->doorbell_gnt, bend->far_end);
+
+ if (bend->hw_type == NET_ACCEL_MSG_HWTYPE_FALCON_A) {
+ DPRINTK("Remove rptr grant %d\n", accel_hw_priv->evq_rptr_gnt);
+ ungrant_or_crash(accel_hw_priv->evq_rptr_gnt, bend->far_end);
+ }
+
+ for (i = 0; i < accel_hw_priv->evq_npages; i++) {
+ DPRINTK("Remove evq grant %d\n", accel_hw_priv->evq_mem_gnts[i]);
+ ungrant_or_crash(accel_hw_priv->evq_mem_gnts[i], bend->far_end);
+ }
+
+ bend->hw_state = NETBACK_ACCEL_RES_FILTER;
+
+ return;
+}
+
+
+static int ef_bend_hwinfo_falcon_common(struct netback_accel *bend,
+ struct net_accel_hw_falcon_b *hwinfo)
+{
+ struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+ struct efx_vi_hw_resource_metadata res_mdata;
+ struct efx_vi_hw_resource res_array[EFX_VI_HW_RESOURCE_MAXSIZE];
+ int rc, len = EFX_VI_HW_RESOURCE_MAXSIZE, i, pfn = 0;
+ unsigned long txdmaq_pfn = 0, rxdmaq_pfn = 0;
+
+ rc = efx_vi_hw_resource_get_phys(accel_hw_priv->efx_vih, &res_mdata,
+ res_array, &len);
+ if (rc != 0) {
+ DPRINTK("%s: resource_get_phys returned %d\n",
+ __FUNCTION__, rc);
+ return rc;
+ }
+
+ if (res_mdata.version != 0)
+ return -EPROTO;
+
+ hwinfo->nic_arch = res_mdata.nic_arch;
+ hwinfo->nic_variant = res_mdata.nic_variant;
+ hwinfo->nic_revision = res_mdata.nic_revision;
+
+ hwinfo->evq_order = res_mdata.evq_order;
+ hwinfo->evq_offs = res_mdata.evq_offs;
+ hwinfo->evq_capacity = res_mdata.evq_capacity;
+ hwinfo->instance = res_mdata.instance;
+ hwinfo->rx_capacity = res_mdata.rx_capacity;
+ hwinfo->tx_capacity = res_mdata.tx_capacity;
+
+ VPRINTK("evq_order %d evq_offs %d evq_cap %d inst %d rx_cap %d tx_cap %d\n",
+ hwinfo->evq_order, hwinfo->evq_offs, hwinfo->evq_capacity,
+ hwinfo->instance, hwinfo->rx_capacity, hwinfo->tx_capacity);
+
+ for (i = 0; i < len; i++) {
+ struct efx_vi_hw_resource *res = &(res_array[i]);
+ switch (res->type) {
+ case EFX_VI_HW_RESOURCE_TXDMAQ:
+ txdmaq_pfn = page_to_pfn(virt_to_page(res->address));
+ break;
+ case EFX_VI_HW_RESOURCE_RXDMAQ:
+ rxdmaq_pfn = page_to_pfn(virt_to_page(res->address));
+ break;
+ case EFX_VI_HW_RESOURCE_EVQTIMER:
+ break;
+ case EFX_VI_HW_RESOURCE_EVQRPTR:
+ case EFX_VI_HW_RESOURCE_EVQRPTR_OFFSET:
+ hwinfo->evq_rptr = res->address;
+ break;
+ case EFX_VI_HW_RESOURCE_EVQMEMKVA:
+ accel_hw_priv->evq_npages = 1 << res_mdata.evq_order;
+ pfn = page_to_pfn(virt_to_page(res->address));
+ break;
+ case EFX_VI_HW_RESOURCE_BELLPAGE:
+ hwinfo->doorbell_mfn = res->address;
+ break;
+ default:
+ EPRINTK("%s: Unknown hardware resource type %d\n",
+ __FUNCTION__, res->type);
+ break;
+ }
+ }
+
+ VPRINTK("Passing txdmaq page pfn %lx\n", txdmaq_pfn);
+ accel_hw_priv->txdmaq_gnt = hwinfo->txdmaq_gnt =
+ net_accel_grant_page(bend->hdev_data, pfn_to_mfn(txdmaq_pfn),
+ 0);
+
+ VPRINTK("Passing rxdmaq page pfn %lx\n", rxdmaq_pfn);
+ accel_hw_priv->rxdmaq_gnt = hwinfo->rxdmaq_gnt =
+ net_accel_grant_page(bend->hdev_data, pfn_to_mfn(rxdmaq_pfn),
+ 0);
+
+ VPRINTK("Passing doorbell page mfn %x\n", hwinfo->doorbell_mfn);
+ /* Make the relevant H/W pages mappable by the far end */
+ accel_hw_priv->doorbell_gnt = hwinfo->doorbell_gnt =
+ net_accel_grant_page(bend->hdev_data, hwinfo->doorbell_mfn, 1);
+
+ /* Now do the same for the memory pages */
+ /* Convert the page + length we got back for the evq to grants. */
+ for (i = 0; i < accel_hw_priv->evq_npages; i++) {
+ accel_hw_priv->evq_mem_gnts[i] = hwinfo->evq_mem_gnts[i] =
+ net_accel_grant_page(bend->hdev_data, pfn_to_mfn(pfn), 0);
+ VPRINTK("Got grant %u for evq pfn %x\n", hwinfo->evq_mem_gnts[i],
+ pfn);
+ pfn++;
+ }
+
+ return 0;
+}
+
+
+static int ef_bend_hwinfo_falcon_a(struct netback_accel *bend,
+ struct net_accel_hw_falcon_a *hwinfo)
+{
+ int rc;
+ struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+
+ if ((rc = ef_bend_hwinfo_falcon_common(bend, &hwinfo->common)) != 0)
+ return rc;
+
+ /*
+ * Note that unlike the above, where the message field is the
+ * page number, here evq_rptr is the entire address because
+ * it is currently a pointer into the densely mapped timer page.
+ */
+ VPRINTK("Passing evq_rptr pfn %x for rptr %x\n",
+ hwinfo->common.evq_rptr >> PAGE_SHIFT,
+ hwinfo->common.evq_rptr);
+ rc = net_accel_grant_page(bend->hdev_data,
+ hwinfo->common.evq_rptr >> PAGE_SHIFT, 0);
+ if (rc < 0)
+ return rc;
+
+ accel_hw_priv->evq_rptr_gnt = hwinfo->evq_rptr_gnt = rc;
+ VPRINTK("evq_rptr_gnt got %d\n", hwinfo->evq_rptr_gnt);
+
+ return 0;
+}
+
+
+static int ef_bend_hwinfo_falcon_b(struct netback_accel *bend,
+ struct net_accel_hw_falcon_b *hwinfo)
+{
+ return ef_bend_hwinfo_falcon_common(bend, hwinfo);
+}
+
+
+/*
+ * Fill in the message with a description of the hardware resources, based on
+ * the H/W type
+ */
+static int netback_accel_hwinfo(struct netback_accel *bend,
+ struct net_accel_msg_hw *msgvi)
+{
+ int rc = 0;
+
+ BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_FILTER);
+
+ msgvi->type = bend->hw_type;
+ switch (bend->hw_type) {
+ case NET_ACCEL_MSG_HWTYPE_FALCON_A:
+ rc = ef_bend_hwinfo_falcon_a(bend, &msgvi->resources.falcon_a);
+ break;
+ case NET_ACCEL_MSG_HWTYPE_FALCON_B:
+ rc = ef_bend_hwinfo_falcon_b(bend, &msgvi->resources.falcon_b);
+ break;
+ case NET_ACCEL_MSG_HWTYPE_NONE:
+ /* Nothing to do. The slow path should just work. */
+ break;
+ }
+
+ if (rc == 0)
+ bend->hw_state = NETBACK_ACCEL_RES_HWINFO;
+
+ return rc;
+}
+
+
+/* Allocate hardware resources and make them available to the client domain */
+int netback_accel_setup_vnic_hw(struct netback_accel *bend)
+{
+ struct net_accel_msg msg;
+ int err;
+
+ /* Allocate the event queue, VI and so on. */
+ err = ef_get_vnic(bend);
+ if (err) {
+ EPRINTK("Failed to allocate hardware resource for bend:"
+ "error %d\n", err);
+ return err;
+ }
+
+ /* Set up the filter management */
+ err = netback_accel_filter_init(bend);
+ if (err) {
+ EPRINTK("Filter setup failed, error %d", err);
+ ef_free_vnic(bend);
+ return err;
+ }
+
+ net_accel_msg_init(&msg, NET_ACCEL_MSG_SETHW);
+
+ /*
+ * Extract the low-level hardware info we will actually pass to the
+ * other end, and set up the grants/ioremap permissions needed
+ */
+ err = netback_accel_hwinfo(bend, &msg.u.hw);
+
+ if (err != 0) {
+ netback_accel_filter_shutdown(bend);
+ ef_free_vnic(bend);
+ return err;
+ }
+
+ /* Send the message, this is a reply to a hello-reply */
+ err = net_accel_msg_reply_notify(bend->shared_page,
+ bend->msg_channel_irq,
+ &bend->to_domU, &msg);
+
+ /*
+ * The message should succeed as it's logically a reply and we
+ * guarantee space for replies, but a misbehaving frontend
+ * could result in that behaviour, so be tolerant
+ */
+ if (err != 0) {
+ netback_accel_release_hwinfo(bend);
+ netback_accel_filter_shutdown(bend);
+ ef_free_vnic(bend);
+ }
+
+ return err;
+}
+
+
+/* Free hardware resources */
+void netback_accel_shutdown_vnic_hw(struct netback_accel *bend)
+{
+ /*
+ * Only try and release resources if accel_hw_priv was setup,
+ * otherwise there is nothing to do as we're on "null-op"
+ * acceleration
+ */
+ switch (bend->hw_state) {
+ case NETBACK_ACCEL_RES_HWINFO:
+ VPRINTK("Release hardware resources\n");
+ netback_accel_release_hwinfo(bend);
+ /* deliberate drop through */
+ case NETBACK_ACCEL_RES_FILTER:
+ VPRINTK("Free filters...\n");
+ netback_accel_filter_shutdown(bend);
+ /* deliberate drop through */
+ case NETBACK_ACCEL_RES_ALLOC:
+ VPRINTK("Free vnic...\n");
+ ef_free_vnic(bend);
+ /* deliberate drop through */
+ case NETBACK_ACCEL_RES_NONE:
+ break;
+ default:
+ BUG();
+ }
+}
+
+/**************************************************************************
+ *
+ * Buffer table stuff
+ *
+ **************************************************************************/
+
+/*
+ * Undo any allocation that netback_accel_msg_rx_buffer_map() has made
+ * if it fails half way through
+ */
+static inline void buffer_map_cleanup(struct netback_accel *bend, int i)
+{
+ while (i > 0) {
+ i--;
+ bend->buffer_maps_index--;
+ net_accel_unmap_device_page(bend->hdev_data,
+ bend->buffer_maps[bend->buffer_maps_index],
+ bend->buffer_addrs[bend->buffer_maps_index]);
+ }
+}
+
+
+int netback_accel_add_buffers(struct netback_accel *bend, int pages, int log2_pages,
+ u32 *grants, u32 *buf_addr_out)
+{
+ struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+ unsigned long long addr_array[NET_ACCEL_MSG_MAX_PAGE_REQ];
+ int rc, i, index;
+ u64 dev_bus_addr;
+
+ /* Make sure we can't overflow the dma_maps array */
+ if (accel_hw_priv->dma_maps_index >=
+ bend->max_pages / NET_ACCEL_MSG_MAX_PAGE_REQ) {
+ EPRINTK("%s: too many buffer table allocations: %d %d\n",
+ __FUNCTION__, accel_hw_priv->dma_maps_index,
+ bend->max_pages / NET_ACCEL_MSG_MAX_PAGE_REQ);
+ return -EINVAL;
+ }
+
+ /* Make sure we can't overflow the buffer_maps array */
+ if (bend->buffer_maps_index + pages > bend->max_pages) {
+ EPRINTK("%s: too many pages mapped: %d + %d > %d\n",
+ __FUNCTION__, bend->buffer_maps_index,
+ pages, bend->max_pages);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < pages; i++) {
+ VPRINTK("%s: mapping page %d\n", __FUNCTION__, i);
+ rc = net_accel_map_device_page
+ (bend->hdev_data, grants[i],
+ &bend->buffer_maps[bend->buffer_maps_index],
+ &dev_bus_addr);
+
+ if (rc != 0) {
+ EPRINTK("error in net_accel_map_device_page\n");
+ buffer_map_cleanup(bend, i);
+ return rc;
+ }
+
+ bend->buffer_addrs[bend->buffer_maps_index] = dev_bus_addr;
+
+ bend->buffer_maps_index++;
+
+ addr_array[i] = dev_bus_addr;
+ }
+
+ VPRINTK("%s: mapping dma addresses to vih %p\n", __FUNCTION__,
+ accel_hw_priv->efx_vih);
+
+ index = accel_hw_priv->dma_maps_index;
+ if ((rc = efx_vi_dma_map_addrs(accel_hw_priv->efx_vih, addr_array, pages,
+ &(accel_hw_priv->dma_maps[index]))) < 0) {
+ EPRINTK("error in dma_map_pages\n");
+ buffer_map_cleanup(bend, i);
+ return rc;
+ }
+
+ accel_hw_priv->dma_maps_index++;
+ NETBACK_ACCEL_STATS_OP(bend->stats.num_buffer_pages += pages);
+
+ //DPRINTK("%s: getting map address\n", __FUNCTION__);
+
+ *buf_addr_out = efx_vi_dma_get_map_addr(accel_hw_priv->efx_vih,
+ accel_hw_priv->dma_maps[index]);
+
+ //DPRINTK("%s: done\n", __FUNCTION__);
+
+ return 0;
+}
+
+
+int netback_accel_remove_buffers(struct netback_accel *bend)
+{
+ /* Only try to free buffers if accel_hw_priv was setup */
+ if (bend->hw_state != NETBACK_ACCEL_RES_NONE) {
+ struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+ int i;
+
+ efx_vi_reset(accel_hw_priv->efx_vih);
+
+ while (accel_hw_priv->dma_maps_index > 0) {
+ accel_hw_priv->dma_maps_index--;
+ i = accel_hw_priv->dma_maps_index;
+ efx_vi_dma_unmap_addrs(accel_hw_priv->efx_vih,
+ accel_hw_priv->dma_maps[i]);
+ }
+
+ while (bend->buffer_maps_index > 0) {
+ VPRINTK("Unmapping granted buffer %d\n",
+ bend->buffer_maps_index);
+ bend->buffer_maps_index--;
+ i = bend->buffer_maps_index;
+ net_accel_unmap_device_page(bend->hdev_data,
+ bend->buffer_maps[i],
+ bend->buffer_addrs[i]);
+ }
+
+ NETBACK_ACCEL_STATS_OP(bend->stats.num_buffer_pages = 0);
+ }
+
+ return 0;
+}
+
+/**************************************************************************
+ *
+ * Filter stuff
+ *
+ **************************************************************************/
+
+static int netback_accel_filter_init(struct netback_accel *bend)
+{
+ struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+ int i, rc;
+
+ BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_ALLOC);
+
+ spin_lock_init(&accel_hw_priv->filter_lock);
+
+ if ((rc = cuckoo_hash_init(&accel_hw_priv->filter_hash_table,
+ 5 /* space for 32 filters */, 8)) != 0) {
+ EPRINTK("Failed to initialise filter hash table\n");
+ return rc;
+ }
+
+ accel_hw_priv->fspecs = kzalloc(sizeof(struct netback_accel_filter_spec) *
+ bend->quotas.max_filters,
+ GFP_KERNEL);
+
+ if (accel_hw_priv->fspecs == NULL) {
+ EPRINTK("No memory for filter specs.\n");
+ cuckoo_hash_destroy(&accel_hw_priv->filter_hash_table);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < bend->quotas.max_filters; i++) {
+ accel_hw_priv->free_filters |= (1 << i);
+ }
+
+ /* Base mask on highest set bit in max_filters */
+ accel_hw_priv->filter_idx_mask = (1 << fls(bend->quotas.max_filters)) - 1;
+ VPRINTK("filter setup: max is %x mask is %x\n",
+ bend->quotas.max_filters, accel_hw_priv->filter_idx_mask);
+
+ bend->hw_state = NETBACK_ACCEL_RES_FILTER;
+
+ return 0;
+}
+
+
+static inline void make_filter_key(cuckoo_hash_ip_key *key,
+ struct netback_accel_filter_spec *filt)
+
+{
+ key->local_ip = filt->destip_be;
+ key->local_port = filt->destport_be;
+ key->proto = filt->proto;
+}
+
+
+static inline
+void netback_accel_free_filter(struct falcon_bend_accel_priv *accel_hw_priv,
+ int filter)
+{
+ cuckoo_hash_ip_key filter_key;
+
+ if (!(accel_hw_priv->free_filters & (1 << filter))) {
+ efx_vi_filter_stop(accel_hw_priv->efx_vih,
+ accel_hw_priv->fspecs[filter].filter_handle);
+ make_filter_key(&filter_key, &(accel_hw_priv->fspecs[filter]));
+ if (cuckoo_hash_remove(&accel_hw_priv->filter_hash_table,
+ (cuckoo_hash_key *)&filter_key)) {
+ EPRINTK("%s: Couldn't find filter to remove from table\n",
+ __FUNCTION__);
+ BUG();
+ }
+ }
+}
+
+
+static void netback_accel_filter_shutdown(struct netback_accel *bend)
+{
+ struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+ int i;
+ unsigned long flags;
+
+ BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_FILTER);
+
+ spin_lock_irqsave(&accel_hw_priv->filter_lock, flags);
+
+ BUG_ON(accel_hw_priv->fspecs == NULL);
+
+ for (i = 0; i < bend->quotas.max_filters; i++) {
+ netback_accel_free_filter(accel_hw_priv, i);
+ }
+
+ kfree(accel_hw_priv->fspecs);
+ accel_hw_priv->fspecs = NULL;
+ accel_hw_priv->free_filters = 0;
+
+ cuckoo_hash_destroy(&accel_hw_priv->filter_hash_table);
+
+ spin_unlock_irqrestore(&accel_hw_priv->filter_lock, flags);
+
+ bend->hw_state = NETBACK_ACCEL_RES_ALLOC;
+}
+
+
+/*! Suggest a filter to replace when we want to insert a new one and have
+ * none free.
+ */
+static unsigned get_victim_filter(struct netback_accel *bend)
+{
+ /*
+ * We could attempt to get really clever, and may do at some
+ * point, but random replacement is v. cheap and low on
+ * pathological worst cases.
+ */
+ unsigned index, cycles;
+
+ rdtscl(cycles);
+
+ /*
+ * Some doubt about the quality of the bottom few bits, so
+ * throw 'em * away
+ */
+ index = (cycles >> 4) & ((struct falcon_bend_accel_priv *)
+ bend->accel_hw_priv)->filter_idx_mask;
+ /*
+ * We don't enforce that the number of filters is a power of
+ * two, but the masking gets us to within one subtraction of a
+ * valid index
+ */
+ if (index >= bend->quotas.max_filters)
+ index -= bend->quotas.max_filters;
+ DPRINTK("backend %s->%d has no free filters. Filter %d will be evicted\n",
+ bend->nicname, bend->far_end, index);
+ return index;
+}
+
+
+/* Add a filter for the specified IP/port to the backend */
+int
+netback_accel_filter_check_add(struct netback_accel *bend,
+ struct netback_accel_filter_spec *filt)
+{
+ struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+ struct netback_accel_filter_spec *fs;
+ unsigned filter_index;
+ unsigned long flags;
+ int rc, recycling = 0;
+ cuckoo_hash_ip_key filter_key, evict_key;
+
+ BUG_ON(filt->proto != IPPROTO_TCP && filt->proto != IPPROTO_UDP);
+
+ DPRINTK("Will add %s filter for dst ip %08x and dst port %d\n",
+ (filt->proto == IPPROTO_TCP) ? "TCP" : "UDP",
+ be32_to_cpu(filt->destip_be), be16_to_cpu(filt->destport_be));
+
+ spin_lock_irqsave(&accel_hw_priv->filter_lock, flags);
+ /*
+ * Check to see if we're already filtering this IP address and
+ * port. Happens if you insert a filter mid-stream as there
+ * are many packets backed up to be delivered to dom0 already
+ */
+ make_filter_key(&filter_key, filt);
+ if (cuckoo_hash_lookup(&accel_hw_priv->filter_hash_table,
+ (cuckoo_hash_key *)(&filter_key),
+ &filter_index)) {
+ DPRINTK("Found matching filter %d already in table\n",
+ filter_index);
+ rc = -1;
+ goto out;
+ }
+
+ if (accel_hw_priv->free_filters == 0) {
+ filter_index = get_victim_filter(bend);
+ recycling = 1;
+ } else {
+ filter_index = __ffs(accel_hw_priv->free_filters);
+ clear_bit(filter_index, &accel_hw_priv->free_filters);
+ }
+
+ fs = &accel_hw_priv->fspecs[filter_index];
+
+ if (recycling) {
+ DPRINTK("Removing filter index %d handle %p\n", filter_index,
+ fs->filter_handle);
+
+ if ((rc = efx_vi_filter_stop(accel_hw_priv->efx_vih,
+ fs->filter_handle)) != 0) {
+ EPRINTK("Couldn't clear NIC filter table entry %d\n", rc);
+ }
+
+ make_filter_key(&evict_key, fs);
+ if (cuckoo_hash_remove(&accel_hw_priv->filter_hash_table,
+ (cuckoo_hash_key *)&evict_key)) {
+ EPRINTK("Couldn't find filter to remove from table\n");
+ BUG();
+ }
+ NETBACK_ACCEL_STATS_OP(bend->stats.num_filters--);
+ }
+
+ /* Update the filter spec with new details */
+ *fs = *filt;
+
+ if ((rc = cuckoo_hash_add(&accel_hw_priv->filter_hash_table,
+ (cuckoo_hash_key *)&filter_key, filter_index,
+ 1)) != 0) {
+ EPRINTK("Error (%d) adding filter to table\n", rc);
+ accel_hw_priv->free_filters |= (1 << filter_index);
+ goto out;
+ }
+
+ rc = efx_vi_filter(accel_hw_priv->efx_vih, filt->proto, filt->destip_be,
+ filt->destport_be,
+ (struct filter_resource_t **)&fs->filter_handle);
+
+ if (rc != 0) {
+ EPRINTK("Hardware filter insertion failed. Error %d\n", rc);
+ accel_hw_priv->free_filters |= (1 << filter_index);
+ cuckoo_hash_remove(&accel_hw_priv->filter_hash_table,
+ (cuckoo_hash_key *)&filter_key);
+ rc = -1;
+ goto out;
+ }
+
+ NETBACK_ACCEL_STATS_OP(bend->stats.num_filters++);
+
+ VPRINTK("%s: success index %d handle %p\n", __FUNCTION__, filter_index,
+ fs->filter_handle);
+
+ rc = filter_index;
+ out:
+ spin_unlock_irqrestore(&accel_hw_priv->filter_lock, flags);
+ return rc;
+}
+
+
+/* Remove a filter entry for the specific device and IP/port */
+static void netback_accel_filter_remove(struct netback_accel *bend,
+ int filter_index)
+{
+ struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+
+ BUG_ON(accel_hw_priv->free_filters & (1 << filter_index));
+ netback_accel_free_filter(accel_hw_priv, filter_index);
+ accel_hw_priv->free_filters |= (1 << filter_index);
+}
+
+
+/* Remove a filter entry for the specific device and IP/port */
+void netback_accel_filter_remove_spec(struct netback_accel *bend,
+ struct netback_accel_filter_spec *filt)
+{
+ struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+ unsigned filter_found;
+ unsigned long flags;
+ cuckoo_hash_ip_key filter_key;
+ struct netback_accel_filter_spec *fs;
+
+ if (filt->proto == IPPROTO_TCP) {
+ DPRINTK("Remove TCP filter for dst ip %08x and dst port %d\n",
+ be32_to_cpu(filt->destip_be),
+ be16_to_cpu(filt->destport_be));
+ } else if (filt->proto == IPPROTO_UDP) {
+ DPRINTK("Remove UDP filter for dst ip %08x and dst port %d\n",
+ be32_to_cpu(filt->destip_be),
+ be16_to_cpu(filt->destport_be));
+ } else {
+ /*
+ * This could be provoked by an evil frontend, so can't
+ * BUG(), but harmless as it should fail tests below
+ */
+ DPRINTK("Non-TCP/UDP filter dst ip %08x and dst port %d\n",
+ be32_to_cpu(filt->destip_be),
+ be16_to_cpu(filt->destport_be));
+ }
+
+ spin_lock_irqsave(&accel_hw_priv->filter_lock, flags);
+
+ make_filter_key(&filter_key, filt);
+ if (!cuckoo_hash_lookup(&accel_hw_priv->filter_hash_table,
+ (cuckoo_hash_key *)(&filter_key),
+ &filter_found)) {
+ EPRINTK("Couldn't find matching filter already in table\n");
+ goto out;
+ }
+
+ /* Do a full check to make sure we've not had a hash collision */
+ fs = &accel_hw_priv->fspecs[filter_found];
+ if (fs->destip_be == filt->destip_be &&
+ fs->destport_be == filt->destport_be &&
+ fs->proto == filt->proto &&
+ !memcmp(fs->mac, filt->mac, ETH_ALEN)) {
+ netback_accel_filter_remove(bend, filter_found);
+ } else {
+ EPRINTK("Entry in hash table does not match filter spec\n");
+ goto out;
+ }
+
+ out:
+ spin_unlock_irqrestore(&accel_hw_priv->filter_lock, flags);
+}
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef NETBACK_ACCEL_SOLARFLARE_H
+#define NETBACK_ACCEL_SOLARFLARE_H
+
+#include "accel.h"
+#include "accel_msg_iface.h"
+
+#include "driverlink_api.h"
+
+#define MAX_NICS 5
+#define MAX_PORTS 2
+
+
+extern int netback_accel_sf_init(void);
+extern void netback_accel_sf_shutdown(void);
+extern int netback_accel_sf_hwtype(struct netback_accel *bend);
+
+extern int netback_accel_sf_char_init(void);
+extern void netback_accel_sf_char_shutdown(void);
+
+extern int netback_accel_setup_vnic_hw(struct netback_accel *bend);
+extern void netback_accel_shutdown_vnic_hw(struct netback_accel *bend);
+
+extern int netback_accel_add_buffers(struct netback_accel *bend, int pages,
+ int log2_pages, u32 *grants,
+ u32 *buf_addr_out);
+extern int netback_accel_remove_buffers(struct netback_accel *bend);
+
+
+/* Add a filter for the specified IP/port to the backend */
+extern int
+netback_accel_filter_check_add(struct netback_accel *bend,
+ struct netback_accel_filter_spec *filt);
+/* Remove a filter entry for the specific device and IP/port */
+extern
+void netback_accel_filter_remove_index(struct netback_accel *bend,
+ int filter_index);
+extern
+void netback_accel_filter_remove_spec(struct netback_accel *bend,
+ struct netback_accel_filter_spec *filt);
+
+/* This is designed to look a bit like a skb */
+struct netback_pkt_buf {
+ union {
+ unsigned char *raw;
+ } mac;
+ union {
+ struct iphdr *iph;
+ struct arphdr *arph;
+ unsigned char *raw;
+ } nh;
+ int protocol;
+};
+
+/*! \brief Handle a received packet: insert fast path filters as necessary
+ * \param skb The packet buffer
+ */
+extern void netback_accel_rx_packet(struct netback_pkt_buf *skb, void *fwd_priv);
+
+/*! \brief Handle a transmitted packet: update fast path filters as necessary
+ * \param skb The packet buffer
+ */
+extern void netback_accel_tx_packet(struct sk_buff *skb, void *fwd_priv);
+
+#endif /* NETBACK_ACCEL_SOLARFLARE_H */
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <xen/evtchn.h>
+#include <linux/mutex.h>
+
+/* drivers/xen/netback/common.h */
+#include "common.h"
+
+#include "accel.h"
+#include "accel_solarflare.h"
+#include "accel_util.h"
+
+#define NODENAME_PATH_FMT "backend/vif/%d/%d"
+
+#define NETBACK_ACCEL_FROM_XENBUS_DEVICE(_dev) (struct netback_accel *) \
+ ((struct backend_info *)(_dev)->dev.driver_data)->netback_accel_priv
+
+/* List of all the bends currently in existence. */
+struct netback_accel *bend_list = NULL;
+DEFINE_MUTEX(bend_list_mutex);
+
+/* Put in bend_list. Must hold bend_list_mutex */
+static void link_bend(struct netback_accel *bend)
+{
+ bend->next_bend = bend_list;
+ bend_list = bend;
+}
+
+/* Remove from bend_list, Must hold bend_list_mutex */
+static void unlink_bend(struct netback_accel *bend)
+{
+ struct netback_accel *tmp = bend_list;
+ struct netback_accel *prev = NULL;
+ while (tmp != NULL) {
+ if (tmp == bend) {
+ if (prev != NULL)
+ prev->next_bend = bend->next_bend;
+ else
+ bend_list = bend->next_bend;
+ return;
+ }
+ prev = tmp;
+ tmp = tmp->next_bend;
+ }
+}
+
+
+/* Demultiplex a message IRQ from the frontend driver. */
+static irqreturn_t msgirq_from_frontend(int irq, void *context)
+{
+ struct xenbus_device *dev = context;
+ struct netback_accel *bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev);
+ VPRINTK("irq %d from device %s\n", irq, dev->nodename);
+ schedule_work(&bend->handle_msg);
+ return IRQ_HANDLED;
+}
+
+
+/*
+ * Demultiplex an IRQ from the frontend driver. This is never used
+ * functionally, but we need it to pass to the bind function, and may
+ * get called spuriously
+ */
+static irqreturn_t netirq_from_frontend(int irq, void *context)
+{
+ VPRINTK("netirq %d from device %s\n", irq,
+ ((struct xenbus_device *)context)->nodename);
+
+ return IRQ_HANDLED;
+}
+
+
+/* Read the limits values of the xenbus structure. */
+static
+void cfg_hw_quotas(struct xenbus_device *dev, struct netback_accel *bend)
+{
+ int err = xenbus_gather
+ (XBT_NIL, dev->nodename,
+ "limits/max-filters", "%d", &bend->quotas.max_filters,
+ "limits/max-buf-pages", "%d", &bend->quotas.max_buf_pages,
+ "limits/max-mcasts", "%d", &bend->quotas.max_mcasts,
+ NULL);
+ if (err) {
+ /*
+ * TODO what if they have previously been set by the
+ * user? This will overwrite with defaults. Maybe
+ * not what we want to do, but useful in startup
+ * case
+ */
+ DPRINTK("Failed to read quotas from xenbus, using defaults\n");
+ bend->quotas.max_filters = NETBACK_ACCEL_DEFAULT_MAX_FILTERS;
+ bend->quotas.max_buf_pages = sfc_netback_max_pages;
+ bend->quotas.max_mcasts = NETBACK_ACCEL_DEFAULT_MAX_MCASTS;
+ }
+
+ return;
+}
+
+
+static void bend_config_accel_change(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ struct netback_accel *bend;
+
+ bend = container_of(watch, struct netback_accel, config_accel_watch);
+
+ mutex_lock(&bend->bend_mutex);
+ if (bend->config_accel_watch.node != NULL) {
+ struct xenbus_device *dev =
+ (struct xenbus_device *)bend->hdev_data;
+ DPRINTK("Watch matched, got dev %p otherend %p\n",
+ dev, dev->otherend);
+ if(!xenbus_exists(XBT_NIL, watch->node, "")) {
+ DPRINTK("Ignoring watch as otherend seems invalid\n");
+ goto out;
+ }
+
+ cfg_hw_quotas(dev, bend);
+ }
+ out:
+ mutex_unlock(&bend->bend_mutex);
+ return;
+}
+
+
+/*
+ * Setup watch on "limits" in the backend vif info to know when
+ * configuration has been set
+ */
+static int setup_config_accel_watch(struct xenbus_device *dev,
+ struct netback_accel *bend)
+{
+ int err;
+
+ VPRINTK("Setting watch on %s/%s\n", dev->nodename, "limits");
+
+ err = xenbus_watch_path2(dev, dev->nodename, "limits",
+ &bend->config_accel_watch,
+ bend_config_accel_change);
+
+ if (err) {
+ EPRINTK("%s: Failed to register xenbus watch: %d\n",
+ __FUNCTION__, err);
+ bend->config_accel_watch.node = NULL;
+ return err;
+ }
+ return 0;
+}
+
+
+static int
+cfg_frontend_info(struct xenbus_device *dev, struct netback_accel *bend,
+ int *grants)
+{
+ /* Get some info from xenbus on the event channel and shmem grant */
+ int err = xenbus_gather(XBT_NIL, dev->otherend,
+ "accel-msg-channel", "%u", &bend->msg_channel,
+ "accel-ctrl-page", "%d", &(grants[0]),
+ "accel-msg-page", "%d", &(grants[1]),
+ "accel-net-channel", "%u", &bend->net_channel,
+ NULL);
+ if (err)
+ EPRINTK("failed to read event channels or shmem grant: %d\n",
+ err);
+ else
+ DPRINTK("got event chan %d and net chan %d from frontend\n",
+ bend->msg_channel, bend->net_channel);
+ return err;
+}
+
+
+/* Setup all the comms needed to chat with the front end driver */
+static int setup_vnic(struct xenbus_device *dev)
+{
+ struct netback_accel *bend;
+ int grants[2], err, msgs_per_queue;
+
+ bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev);
+
+ err = cfg_frontend_info(dev, bend, grants);
+ if (err)
+ goto fail1;
+
+ /*
+ * If we get here, both frontend Connected and configuration
+ * options available. All is well.
+ */
+
+ /* Get the hardware quotas for the VNIC in question. */
+ cfg_hw_quotas(dev, bend);
+
+ /* Set up the deferred work handlers */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+ INIT_WORK(&bend->handle_msg,
+ netback_accel_msg_rx_handler);
+#else
+ INIT_WORK(&bend->handle_msg,
+ netback_accel_msg_rx_handler,
+ (void*)bend);
+#endif
+
+ /* Request the frontend mac */
+ err = net_accel_xen_net_read_mac(dev, bend->mac);
+ if (err)
+ goto fail2;
+
+ /* Set up the shared page. */
+ bend->shared_page = net_accel_map_grants_contig(dev, grants, 2,
+ &bend->sh_pages_unmap);
+
+ if (bend->shared_page == NULL) {
+ EPRINTK("failed to map shared page for %s\n", dev->otherend);
+ err = -ENOMEM;
+ goto fail2;
+ }
+
+ /* Initialise the shared page(s) used for comms */
+ net_accel_msg_init_page(bend->shared_page, PAGE_SIZE,
+ bend->net_dev->flags & IFF_UP);
+
+ msgs_per_queue = (PAGE_SIZE/2) / sizeof(struct net_accel_msg);
+
+ net_accel_msg_init_queue
+ (&bend->to_domU, &bend->shared_page->queue0,
+ (struct net_accel_msg *)((__u8*)bend->shared_page + PAGE_SIZE),
+ msgs_per_queue);
+
+ net_accel_msg_init_queue
+ (&bend->from_domU, &bend->shared_page->queue1,
+ (struct net_accel_msg *)((__u8*)bend->shared_page +
+ (3 * PAGE_SIZE / 2)),
+ msgs_per_queue);
+
+ /* Bind the message event channel to a handler
+ *
+ * Note that we will probably get a spurious interrupt when we
+ * do this, so it must not be done until we have set up
+ * everything we need to handle it.
+ */
+ err = bind_interdomain_evtchn_to_irqhandler(dev->otherend_id,
+ bend->msg_channel,
+ msgirq_from_frontend,
+ 0,
+ "netback_accel",
+ dev);
+ if (err < 0) {
+ EPRINTK("failed to bind event channel: %d\n", err);
+ goto fail3;
+ }
+ else
+ bend->msg_channel_irq = err;
+
+ /* TODO: No need to bind this evtchn to an irq. */
+ err = bind_interdomain_evtchn_to_irqhandler(dev->otherend_id,
+ bend->net_channel,
+ netirq_from_frontend,
+ 0,
+ "netback_accel",
+ dev);
+ if (err < 0) {
+ EPRINTK("failed to bind net channel: %d\n", err);
+ goto fail4;
+ }
+ else
+ bend->net_channel_irq = err;
+
+ /*
+ * Grab ourselves an entry in the forwarding hash table. We do
+ * this now so we don't have the embarassmesnt of sorting out
+ * an allocation failure while at IRQ. Because we pass NULL as
+ * the context, the actual hash lookup will succeed for this
+ * NIC, but the check for somewhere to forward to will
+ * fail. This is necessary to prevent forwarding before
+ * hardware resources are set up
+ */
+ err = netback_accel_fwd_add(bend->mac, NULL, bend->fwd_priv);
+ if (err) {
+ EPRINTK("failed to add to fwd hash table\n");
+ goto fail5;
+ }
+
+ /*
+ * Say hello to frontend. Important to do this straight after
+ * obtaining the message queue as otherwise we are vulnerable
+ * to an evil frontend sending a HELLO-REPLY before we've sent
+ * the HELLO and confusing us
+ */
+ netback_accel_msg_tx_hello(bend, NET_ACCEL_MSG_VERSION);
+ return 0;
+
+ fail5:
+ unbind_from_irqhandler(bend->net_channel_irq, dev);
+ fail4:
+ unbind_from_irqhandler(bend->msg_channel_irq, dev);
+ fail3:
+ net_accel_unmap_grants_contig(dev, bend->sh_pages_unmap);
+ bend->shared_page = NULL;
+ bend->sh_pages_unmap = NULL;
+ fail2:
+ fail1:
+ return err;
+}
+
+
+static int read_nicname(struct xenbus_device *dev, struct netback_accel *bend)
+{
+ int len;
+
+ /* nic name used to select interface used for acceleration */
+ bend->nicname = xenbus_read(XBT_NIL, dev->nodename, "accel", &len);
+ if (IS_ERR(bend->nicname))
+ return PTR_ERR(bend->nicname);
+
+ return 0;
+}
+
+static const char *frontend_name = "sfc_netfront";
+
+static int publish_frontend_name(struct xenbus_device *dev)
+{
+ struct xenbus_transaction tr;
+ int err;
+
+ /* Publish the name of the frontend driver */
+ do {
+ err = xenbus_transaction_start(&tr);
+ if (err != 0) {
+ EPRINTK("%s: transaction start failed\n", __FUNCTION__);
+ return err;
+ }
+ err = xenbus_printf(tr, dev->nodename, "accel-frontend",
+ "%s", frontend_name);
+ if (err != 0) {
+ EPRINTK("%s: xenbus_printf failed\n", __FUNCTION__);
+ xenbus_transaction_end(tr, 1);
+ return err;
+ }
+ err = xenbus_transaction_end(tr, 0);
+ } while (err == -EAGAIN);
+
+ if (err != 0) {
+ EPRINTK("failed to end frontend name transaction\n");
+ return err;
+ }
+ return 0;
+}
+
+
+static int unpublish_frontend_name(struct xenbus_device *dev)
+{
+ struct xenbus_transaction tr;
+ int err;
+
+ do {
+ err = xenbus_transaction_start(&tr);
+ if (err != 0)
+ break;
+ err = xenbus_rm(tr, dev->nodename, "accel-frontend");
+ if (err != 0) {
+ xenbus_transaction_end(tr, 1);
+ break;
+ }
+ err = xenbus_transaction_end(tr, 0);
+ } while (err == -EAGAIN);
+
+ return err;
+}
+
+
+static void cleanup_vnic(struct netback_accel *bend)
+{
+ struct xenbus_device *dev;
+
+ dev = (struct xenbus_device *)bend->hdev_data;
+
+ DPRINTK("%s: bend %p dev %p\n", __FUNCTION__, bend, dev);
+
+ DPRINTK("%s: Remove %p's mac from fwd table...\n",
+ __FUNCTION__, bend);
+ netback_accel_fwd_remove(bend->mac, bend->fwd_priv);
+
+ /* Free buffer table allocations */
+ netback_accel_remove_buffers(bend);
+
+ DPRINTK("%s: Release hardware resources...\n", __FUNCTION__);
+ if (bend->accel_shutdown)
+ bend->accel_shutdown(bend);
+
+ if (bend->net_channel_irq) {
+ unbind_from_irqhandler(bend->net_channel_irq, dev);
+ bend->net_channel_irq = 0;
+ }
+
+ if (bend->msg_channel_irq) {
+ unbind_from_irqhandler(bend->msg_channel_irq, dev);
+ bend->msg_channel_irq = 0;
+ }
+
+ if (bend->sh_pages_unmap) {
+ DPRINTK("%s: Unmap grants %p\n", __FUNCTION__,
+ bend->sh_pages_unmap);
+ net_accel_unmap_grants_contig(dev, bend->sh_pages_unmap);
+ bend->sh_pages_unmap = NULL;
+ bend->shared_page = NULL;
+ }
+}
+
+
+/*************************************************************************/
+
+/*
+ * The following code handles accelstate changes between the frontend
+ * and the backend. It calls setup_vnic and cleanup_vnic in matching
+ * pairs in response to transitions.
+ *
+ * Valid state transitions for Dom0 are as follows:
+ *
+ * Closed->Init on probe or in response to Init from domU
+ * Closed->Closing on error/remove
+ *
+ * Init->Connected in response to Connected from domU
+ * Init->Closing on error/remove or in response to Closing from domU
+ *
+ * Connected->Closing on error/remove or in response to Closing from domU
+ *
+ * Closing->Closed in response to Closed from domU
+ *
+ */
+
+
+static void netback_accel_frontend_changed(struct xenbus_device *dev,
+ XenbusState frontend_state)
+{
+ struct netback_accel *bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev);
+ XenbusState backend_state;
+
+ DPRINTK("%s: changing from %s to %s. nodename %s, otherend %s\n",
+ __FUNCTION__, xenbus_strstate(bend->frontend_state),
+ xenbus_strstate(frontend_state),dev->nodename, dev->otherend);
+
+ /*
+ * Ignore duplicate state changes. This can happen if the
+ * frontend changes state twice in quick succession and the
+ * first watch fires in the backend after the second
+ * transition has completed.
+ */
+ if (bend->frontend_state == frontend_state)
+ return;
+
+ bend->frontend_state = frontend_state;
+ backend_state = bend->backend_state;
+
+ switch (frontend_state) {
+ case XenbusStateInitialising:
+ if (backend_state == XenbusStateClosed &&
+ !bend->removing)
+ backend_state = XenbusStateInitialising;
+ break;
+
+ case XenbusStateConnected:
+ if (backend_state == XenbusStateInitialising) {
+ if (!bend->vnic_is_setup &&
+ setup_vnic(dev) == 0) {
+ bend->vnic_is_setup = 1;
+ backend_state = XenbusStateConnected;
+ } else {
+ backend_state = XenbusStateClosing;
+ }
+ }
+ break;
+
+ case XenbusStateInitWait:
+ case XenbusStateInitialised:
+ default:
+ DPRINTK("Unknown state %s (%d) from frontend.\n",
+ xenbus_strstate(frontend_state), frontend_state);
+ /* Unknown state. Fall through. */
+ case XenbusStateClosing:
+ if (backend_state != XenbusStateClosed)
+ backend_state = XenbusStateClosing;
+
+ /*
+ * The bend will now persist (with watches active) in
+ * case the frontend comes back again, eg. after
+ * frontend module reload or suspend/resume
+ */
+
+ break;
+
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ if (bend->vnic_is_setup) {
+ bend->vnic_is_setup = 0;
+ cleanup_vnic(bend);
+ }
+
+ if (backend_state == XenbusStateClosing)
+ backend_state = XenbusStateClosed;
+ break;
+ }
+
+ if (backend_state != bend->backend_state) {
+ DPRINTK("Switching from state %s (%d) to %s (%d)\n",
+ xenbus_strstate(bend->backend_state),
+ bend->backend_state,
+ xenbus_strstate(backend_state), backend_state);
+ bend->backend_state = backend_state;
+ net_accel_update_state(dev, backend_state);
+ }
+
+ wake_up(&bend->state_wait_queue);
+}
+
+
+/* accelstate on the frontend's xenbus node has changed */
+static void bend_domu_accel_change(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ int state;
+ struct netback_accel *bend;
+
+ bend = container_of(watch, struct netback_accel, domu_accel_watch);
+ if (bend->domu_accel_watch.node != NULL) {
+ struct xenbus_device *dev =
+ (struct xenbus_device *)bend->hdev_data;
+ VPRINTK("Watch matched, got dev %p otherend %p\n",
+ dev, dev->otherend);
+ /*
+ * dev->otherend != NULL check to protect against
+ * watch firing when domain goes away and we haven't
+ * yet cleaned up
+ */
+ if (!dev->otherend ||
+ !xenbus_exists(XBT_NIL, watch->node, "") ||
+ strncmp(dev->otherend, vec[XS_WATCH_PATH],
+ strlen(dev->otherend))) {
+ DPRINTK("Ignoring watch as otherend seems invalid\n");
+ return;
+ }
+
+ mutex_lock(&bend->bend_mutex);
+
+ xenbus_scanf(XBT_NIL, dev->otherend, "accelstate", "%d",
+ &state);
+ netback_accel_frontend_changed(dev, state);
+
+ mutex_unlock(&bend->bend_mutex);
+ }
+}
+
+/* Setup watch on frontend's accelstate */
+static int setup_domu_accel_watch(struct xenbus_device *dev,
+ struct netback_accel *bend)
+{
+ int err;
+
+ VPRINTK("Setting watch on %s/%s\n", dev->otherend, "accelstate");
+
+ err = xenbus_watch_path2(dev, dev->otherend, "accelstate",
+ &bend->domu_accel_watch,
+ bend_domu_accel_change);
+ if (err) {
+ EPRINTK("%s: Failed to register xenbus watch: %d\n",
+ __FUNCTION__, err);
+ goto fail;
+ }
+ return 0;
+ fail:
+ bend->domu_accel_watch.node = NULL;
+ return err;
+}
+
+
+int netback_accel_probe(struct xenbus_device *dev)
+{
+ struct netback_accel *bend;
+ struct backend_info *binfo;
+ int err;
+
+ DPRINTK("%s: passed device %s\n", __FUNCTION__, dev->nodename);
+
+ /* Allocate structure to store all our state... */
+ bend = kzalloc(sizeof(struct netback_accel), GFP_KERNEL);
+ if (bend == NULL) {
+ DPRINTK("%s: no memory for bend\n", __FUNCTION__);
+ return -ENOMEM;
+ }
+
+ mutex_init(&bend->bend_mutex);
+
+ mutex_lock(&bend->bend_mutex);
+
+ /* ...and store it where we can get at it */
+ binfo = (struct backend_info *) dev->dev.driver_data;
+ binfo->netback_accel_priv = bend;
+ /* And vice-versa */
+ bend->hdev_data = dev;
+
+ DPRINTK("%s: Adding bend %p to list\n", __FUNCTION__, bend);
+
+ init_waitqueue_head(&bend->state_wait_queue);
+ bend->vnic_is_setup = 0;
+ bend->frontend_state = XenbusStateUnknown;
+ bend->backend_state = XenbusStateClosed;
+ bend->removing = 0;
+
+ sscanf(dev->nodename, NODENAME_PATH_FMT, &bend->far_end,
+ &bend->vif_num);
+
+ err = read_nicname(dev, bend);
+ if (err) {
+ /*
+ * Technically not an error, just means we're not
+ * supposed to accelerate this
+ */
+ DPRINTK("failed to get device name\n");
+ goto fail_nicname;
+ }
+
+ /*
+ * Look up the device name in the list of NICs provided by
+ * driverlink to get the hardware type.
+ */
+ err = netback_accel_sf_hwtype(bend);
+ if (err) {
+ /*
+ * Technically not an error, just means we're not
+ * supposed to accelerate this, probably belongs to
+ * some other backend
+ */
+ DPRINTK("failed to match device name\n");
+ goto fail_init_type;
+ }
+
+ err = publish_frontend_name(dev);
+ if (err)
+ goto fail_publish;
+
+ err = netback_accel_debugfs_create(bend);
+ if (err)
+ goto fail_debugfs;
+
+ mutex_unlock(&bend->bend_mutex);
+
+ err = setup_config_accel_watch(dev, bend);
+ if (err)
+ goto fail_config_watch;
+
+ err = setup_domu_accel_watch(dev, bend);
+ if (err)
+ goto fail_domu_watch;
+
+ /*
+ * Indicate to the other end that we're ready to start unless
+ * the watch has already fired.
+ */
+ mutex_lock(&bend->bend_mutex);
+ if (bend->backend_state == XenbusStateClosed) {
+ bend->backend_state = XenbusStateInitialising;
+ net_accel_update_state(dev, XenbusStateInitialising);
+ }
+ mutex_unlock(&bend->bend_mutex);
+
+ mutex_lock(&bend_list_mutex);
+ link_bend(bend);
+ mutex_unlock(&bend_list_mutex);
+
+ return 0;
+
+fail_domu_watch:
+
+ unregister_xenbus_watch(&bend->config_accel_watch);
+ kfree(bend->config_accel_watch.node);
+fail_config_watch:
+
+ /*
+ * Flush the scheduled work queue before freeing bend to get
+ * rid of any pending netback_accel_msg_rx_handler()
+ */
+ flush_scheduled_work();
+
+ mutex_lock(&bend->bend_mutex);
+ net_accel_update_state(dev, XenbusStateUnknown);
+ netback_accel_debugfs_remove(bend);
+fail_debugfs:
+
+ unpublish_frontend_name(dev);
+fail_publish:
+
+ /* No need to reverse netback_accel_sf_hwtype. */
+fail_init_type:
+
+ kfree(bend->nicname);
+fail_nicname:
+ binfo->netback_accel_priv = NULL;
+ mutex_unlock(&bend->bend_mutex);
+ kfree(bend);
+ return err;
+}
+
+
+int netback_accel_remove(struct xenbus_device *dev)
+{
+ struct backend_info *binfo;
+ struct netback_accel *bend;
+ int frontend_state;
+
+ binfo = (struct backend_info *) dev->dev.driver_data;
+ bend = (struct netback_accel *) binfo->netback_accel_priv;
+
+ DPRINTK("%s: dev %p bend %p\n", __FUNCTION__, dev, bend);
+
+ BUG_ON(bend == NULL);
+
+ mutex_lock(&bend_list_mutex);
+ unlink_bend(bend);
+ mutex_unlock(&bend_list_mutex);
+
+ mutex_lock(&bend->bend_mutex);
+
+ /* Reject any requests to connect. */
+ bend->removing = 1;
+
+ /*
+ * Switch to closing to tell the other end that we're going
+ * away.
+ */
+ if (bend->backend_state != XenbusStateClosing) {
+ bend->backend_state = XenbusStateClosing;
+ net_accel_update_state(dev, XenbusStateClosing);
+ }
+
+ frontend_state = (int)XenbusStateUnknown;
+ xenbus_scanf(XBT_NIL, dev->otherend, "accelstate", "%d",
+ &frontend_state);
+
+ mutex_unlock(&bend->bend_mutex);
+
+ /*
+ * Wait until this end goes to the closed state. This happens
+ * in response to the other end going to the closed state.
+ * Don't bother doing this if the other end is already closed
+ * because if it is then there is nothing to do.
+ */
+ if (frontend_state != (int)XenbusStateClosed &&
+ frontend_state != (int)XenbusStateUnknown)
+ wait_event(bend->state_wait_queue,
+ bend->backend_state == XenbusStateClosed);
+
+ unregister_xenbus_watch(&bend->domu_accel_watch);
+ kfree(bend->domu_accel_watch.node);
+
+ unregister_xenbus_watch(&bend->config_accel_watch);
+ kfree(bend->config_accel_watch.node);
+
+ /*
+ * Flush the scheduled work queue before freeing bend to get
+ * rid of any pending netback_accel_msg_rx_handler()
+ */
+ flush_scheduled_work();
+
+ mutex_lock(&bend->bend_mutex);
+
+ /* Tear down the vnic if it was set up. */
+ if (bend->vnic_is_setup) {
+ bend->vnic_is_setup = 0;
+ cleanup_vnic(bend);
+ }
+
+ bend->backend_state = XenbusStateUnknown;
+ net_accel_update_state(dev, XenbusStateUnknown);
+
+ netback_accel_debugfs_remove(bend);
+
+ unpublish_frontend_name(dev);
+
+ kfree(bend->nicname);
+
+ binfo->netback_accel_priv = NULL;
+
+ mutex_unlock(&bend->bend_mutex);
+
+ kfree(bend);
+
+ return 0;
+}
+
+
+void netback_accel_shutdown_bends(void)
+{
+ mutex_lock(&bend_list_mutex);
+ /*
+ * I think we should have had a remove callback for all
+ * interfaces before being allowed to unload the module
+ */
+ BUG_ON(bend_list != NULL);
+ mutex_unlock(&bend_list_mutex);
+}
+
+
+void netback_accel_set_closing(struct netback_accel *bend)
+{
+
+ bend->backend_state = XenbusStateClosing;
+ net_accel_update_state((struct xenbus_device *)bend->hdev_data,
+ XenbusStateClosing);
+}
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * \author djr
+ * \brief Compatability layer. Provides definitions of fundamental
+ * types and definitions that are used throughout CI source
+ * code. It does not introduce any link time dependencies,
+ * or include any unnecessary system headers.
+ */
+/*! \cidoxg_include_ci */
+
+#ifndef __CI_COMPAT_H__
+#define __CI_COMPAT_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <ci/compat/primitive.h>
+#include <ci/compat/sysdep.h>
+#include <ci/compat/utils.h>
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CI_COMPAT_H__ */
+
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_compat */
+
+#ifndef __CI_COMPAT_GCC_H__
+#define __CI_COMPAT_GCC_H__
+
+
+#define CI_HAVE_INT64
+
+
+#if defined(__linux__) && defined(__KERNEL__)
+
+# include <linux/types.h>
+
+typedef __u64 ci_uint64;
+typedef __s64 ci_int64;
+# if BITS_PER_LONG == 32
+typedef __s32 ci_ptr_arith_t;
+typedef __u32 ci_uintptr_t;
+# else
+typedef __s64 ci_ptr_arith_t;
+typedef __u64 ci_uintptr_t;
+# endif
+
+
+/* it's not obvious to me why the below is wrong for x64_64, but
+ * gcc seems to complain on this platform
+ */
+# if defined(__ia64__)
+# define CI_PRId64 "ld"
+# define CI_PRIi64 "li"
+# define CI_PRIo64 "lo"
+# define CI_PRIu64 "lu"
+# define CI_PRIx64 "lx"
+# define CI_PRIX64 "lX"
+# else
+# define CI_PRId64 "lld"
+# define CI_PRIi64 "lli"
+# define CI_PRIo64 "llo"
+# define CI_PRIu64 "llu"
+# define CI_PRIx64 "llx"
+# define CI_PRIX64 "llX"
+# endif
+
+# define CI_PRId32 "d"
+# define CI_PRIi32 "i"
+# define CI_PRIo32 "o"
+# define CI_PRIu32 "u"
+# define CI_PRIx32 "x"
+# define CI_PRIX32 "X"
+
+#else
+
+# include <stdint.h>
+# include <inttypes.h>
+
+typedef uint64_t ci_uint64;
+typedef int64_t ci_int64;
+typedef intptr_t ci_ptr_arith_t;
+typedef uintptr_t ci_uintptr_t;
+
+# define CI_PRId64 PRId64
+# define CI_PRIi64 PRIi64
+# define CI_PRIo64 PRIo64
+# define CI_PRIu64 PRIu64
+# define CI_PRIx64 PRIx64
+# define CI_PRIX64 PRIX64
+
+# define CI_PRId32 PRId32
+# define CI_PRIi32 PRIi32
+# define CI_PRIo32 PRIo32
+# define CI_PRIu32 PRIu32
+# define CI_PRIx32 PRIx32
+# define CI_PRIX32 PRIX32
+
+#endif
+
+
+typedef ci_uint64 ci_fixed_descriptor_t;
+
+#define from_fixed_descriptor(desc) ((ci_uintptr_t)(desc))
+#define to_fixed_descriptor(desc) ((ci_fixed_descriptor_t)(ci_uintptr_t)(desc))
+
+
+#if __GNUC__ >= 3 && !defined(__cplusplus)
+/*
+** Checks that [p_mbr] has the same type as [&c_type::mbr_name].
+*/
+# define CI_CONTAINER(c_type, mbr_name, p_mbr) \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(__typeof__(&((c_type*)0)->mbr_name), \
+ __typeof__(p_mbr)), \
+ __CI_CONTAINER(c_type, mbr_name, p_mbr), (void)0)
+
+# define ci_restrict __restrict__
+#endif
+
+
+#if !defined(__KERNEL__) || defined(__unix__)
+#define CI_HAVE_NPRINTF 1
+#endif
+
+
+/* At what version was this introduced? */
+#if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ > 91)
+# define CI_LIKELY(t) __builtin_expect((t), 1)
+# define CI_UNLIKELY(t) __builtin_expect((t), 0)
+#endif
+
+/**********************************************************************
+ * Attributes
+ */
+#if __GNUC__ >= 3 && defined(NDEBUG)
+# define CI_HF __attribute__((visibility("hidden")))
+# define CI_HV __attribute__((visibility("hidden")))
+#else
+# define CI_HF
+# define CI_HV
+#endif
+
+#if __GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)
+# define ci_noinline static __attribute__((__noinline__))
+/* (Linux 2.6 defines its own "noinline", so we use the "__noinline__" form) */
+#else
+# define ci_noinline static
+#endif
+
+#define CI_ALIGN(x) __attribute__ ((aligned (x)))
+
+#define CI_PRINTF_LIKE(a,b) __attribute__((format(printf,a,b)))
+
+#endif /* __CI_COMPAT_GCC_H__ */
+
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_compat */
+
+#ifndef __CI_COMPAT_GCC_X86_H__
+#define __CI_COMPAT_GCC_X86_H__
+
+/*
+** The facts:
+**
+** SSE sfence
+** SSE2 lfence, mfence, pause
+*/
+
+/*
+ Barriers to enforce ordering with respect to:
+
+ normal memory use: ci_wmb, ci_rmb, ci_wmb
+ IO bus access use: ci_wiob, ci_riob, ci_iob
+*/
+#if defined(__x86_64__)
+# define ci_x86_mb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
+#else
+# define ci_x86_mb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
+#endif
+
+/* ?? measure the impact of latency of sfence on a modern processor before we
+ take a decision on how to integrate with respect to writecombining */
+
+/* DJR: I don't think we need to add "memory" here. It means the asm does
+** something to memory that GCC doesn't understand. But all this does is
+** commit changes that GCC thinks have already happened. NB. GCC will not
+** reorder across a __volatile__ __asm__ anyway.
+*/
+#define ci_gcc_fence() __asm__ __volatile__ ("")
+
+#if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
+# define ci_x86_sfence() __asm__ __volatile__ ("sfence")
+# define ci_x86_lfence() __asm__ __volatile__ ("lfence")
+# define ci_x86_mfence() __asm__ __volatile__ ("mfence")
+#else
+# define ci_x86_sfence() __asm__ __volatile__ (".byte 0x0F, 0xAE, 0xF8")
+# define ci_x86_lfence() __asm__ __volatile__ (".byte 0x0F, 0xAE, 0xE8")
+# define ci_x86_mfence() __asm__ __volatile__ (".byte 0x0F, 0xAE, 0xF0")
+#endif
+
+
+/* x86 processors to P4 Xeon store in-order unless executing streaming
+ extensions or when using writecombining
+
+ Hence we do not define ci_wmb to use sfence by default. Requirement is that
+ we do not use writecombining to memory and any code which uses SSE
+ extensions must call sfence directly
+
+ We need to track non intel clones which may support out of order store.
+
+*/
+
+#if CI_CPU_OOS
+# if CI_CPU_HAS_SSE
+# define ci_wmb() ci_x86_sfence()
+# else
+# define ci_wmb() ci_x86_mb()
+# endif
+#else
+# define ci_wmb() ci_gcc_fence()
+#endif
+
+#if CI_CPU_HAS_SSE2
+# define ci_rmb() ci_x86_lfence()
+# define ci_mb() ci_x86_mfence()
+# define ci_riob() ci_x86_lfence()
+# define ci_wiob() ci_x86_sfence()
+# define ci_iob() ci_x86_mfence()
+#else
+# if CI_CPU_HAS_SSE
+# define ci_wiob() ci_x86_sfence()
+# else
+# define ci_wiob() ci_x86_mb()
+# endif
+# define ci_rmb() ci_x86_mb()
+# define ci_mb() ci_x86_mb()
+# define ci_riob() ci_x86_mb()
+# define ci_iob() ci_x86_mb()
+#endif
+
+typedef unsigned long ci_phys_addr_t;
+#define ci_phys_addr_fmt "%lx"
+
+#endif /* __CI_COMPAT_GCC_X86_H__ */
+
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+/*! \cidoxg_include_ci_compat */
+
+#ifndef __CI_COMPAT_PRIMITIVE_H__
+#define __CI_COMPAT_PRIMITIVE_H__
+
+
+/**********************************************************************
+ * Primitive types.
+ */
+
+typedef unsigned char ci_uint8;
+typedef char ci_int8;
+
+typedef unsigned short ci_uint16;
+typedef short ci_int16;
+
+typedef unsigned int ci_uint32;
+typedef int ci_int32;
+
+/* 64-bit support is platform dependent. */
+
+
+/**********************************************************************
+ * Other fancy types.
+ */
+
+typedef ci_uint8 ci_octet;
+
+typedef enum {
+ CI_FALSE = 0,
+ CI_TRUE
+} ci_boolean_t;
+
+
+/**********************************************************************
+ * Some nice types you'd always assumed were standards.
+ * (Really, they are SYSV "standards".)
+ */
+
+#ifdef _WIN32
+typedef unsigned long ulong;
+typedef unsigned int uint;
+typedef char* caddr_t;
+#elif defined(__linux__) && defined(__KERNEL__)
+#include <linux/types.h>
+#elif defined(__linux__)
+#include <sys/types.h>
+#endif
+
+
+#endif /* __CI_COMPAT_PRIMITIVE_H__ */
+
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_compat */
+
+#ifndef __CI_COMPAT_SYSDEP_H__
+#define __CI_COMPAT_SYSDEP_H__
+
+
+/**********************************************************************
+ * Platform definition fixups.
+ */
+
+#if defined(__ci_ul_driver__) && !defined(__ci_driver__)
+# define __ci_driver__
+#endif
+
+#if defined(__ci_driver__) && !defined(__ci_ul_driver__) && \
+ !defined(__KERNEL__)
+# define __KERNEL__
+#endif
+
+
+/**********************************************************************
+ * Sanity checks (no cheating!)
+ */
+
+#if defined(__KERNEL__) && !defined(__ci_driver__)
+# error Insane.
+#endif
+
+#if defined(__KERNEL__) && defined(__ci_ul_driver__)
+# error Madness.
+#endif
+
+#if defined(__unix__) && defined(_WIN32)
+# error Strange.
+#endif
+
+#if defined(__GNUC__) && defined(_MSC_VER)
+# error Crazy.
+#endif
+
+
+/**********************************************************************
+ * Compiler and processor dependencies.
+ */
+
+#if defined(__GNUC__)
+
+# include <ci/compat/gcc.h>
+
+# if defined(__i386__)
+# include <ci/compat/x86.h>
+# include <ci/compat/gcc_x86.h>
+# elif defined(__x86_64__)
+# include <ci/compat/x86_64.h>
+# include <ci/compat/gcc_x86.h>
+# elif defined(__PPC__)
+# include <ci/compat/ppc.h>
+# include <ci/compat/gcc_ppc.h>
+# elif defined(__ia64__)
+# include <ci/compat/ia64.h>
+# include <ci/compat/gcc_ia64.h>
+# else
+# error Unknown processor - GNU C
+# endif
+
+#elif defined(_MSC_VER)
+
+# include <ci/compat/msvc.h>
+
+# if defined(__i386__)
+# include <ci/compat/x86.h>
+# include <ci/compat/msvc_x86.h>
+# elif defined(__x86_64__)
+# include <ci/compat/x86_64.h>
+# include <ci/compat/msvc_x86_64.h>
+# else
+# error Unknown processor MSC
+# endif
+
+#elif defined(__PGI)
+
+# include <ci/compat/x86.h>
+# include <ci/compat/pg_x86.h>
+
+#elif defined(__INTEL_COMPILER)
+
+/* Intel compilers v7 claim to be very gcc compatible. */
+# if __INTEL_COMPILER >= 700
+# include <ci/compat/gcc.h>
+# include <ci/compat/x86.h>
+# include <ci/compat/gcc_x86.h>
+# else
+# error Old Intel compiler not supported. Yet.
+# endif
+
+#else
+# error Unknown compiler.
+#endif
+
+
+/**********************************************************************
+ * Misc stuff (that probably shouldn't be here).
+ */
+
+#ifdef __sun
+# ifdef __KERNEL__
+# define _KERNEL
+# define _SYSCALL32
+# ifdef _LP64
+# define _SYSCALL32_IMPL
+# endif
+# else
+# define _REENTRANT
+# endif
+#endif
+
+
+/**********************************************************************
+ * Defaults for anything left undefined.
+ */
+
+#ifndef CI_LIKELY
+# define CI_LIKELY(t) (t)
+# define CI_UNLIKELY(t) (t)
+#endif
+
+#ifndef ci_restrict
+# define ci_restrict
+#endif
+
+#ifndef ci_inline
+# define ci_inline static inline
+#endif
+
+#ifndef ci_noinline
+# define ci_noinline static
+#endif
+
+#endif /* __CI_COMPAT_SYSDEP_H__ */
+
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * \author djr
+ * \brief Handy utility macros.
+ * \date 2003/01/17
+ */
+
+/*! \cidoxg_include_ci_compat */
+
+#ifndef __CI_COMPAT_UTILS_H__
+#define __CI_COMPAT_UTILS_H__
+
+
+/**********************************************************************
+ * Alignment -- [align] must be a power of 2.
+ **********************************************************************/
+
+ /*! Align forward onto next boundary. */
+
+#define CI_ALIGN_FWD(p, align) (((p)+(align)-1u) & ~((align)-1u))
+
+
+ /*! Align back onto prev boundary. */
+
+#define CI_ALIGN_BACK(p, align) ((p) & ~((align)-1u))
+
+
+ /*! How far to next boundary? */
+
+#define CI_ALIGN_NEEDED(p, align, signed_t) (-(signed_t)(p) & ((align)-1u))
+
+
+ /*! How far beyond prev boundary? */
+
+#define CI_OFFSET(p, align) ((p) & ((align)-1u))
+
+
+ /*! Does object fit in gap before next boundary? */
+
+#define CI_FITS(p, size, align, signed_t) \
+ (CI_ALIGN_NEEDED((p) + 1, (align), signed_t) + 1 >= (size))
+
+
+ /*! Align forward onto next boundary. */
+
+#define CI_PTR_ALIGN_FWD(p, align) \
+ ((char*) CI_ALIGN_FWD(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align))))
+
+ /*! Align back onto prev boundary. */
+
+#define CI_PTR_ALIGN_BACK(p, align) \
+ ((char*) CI_ALIGN_BACK(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align))))
+
+ /*! How far to next boundary? */
+
+#define CI_PTR_ALIGN_NEEDED(p, align) \
+ CI_ALIGN_NEEDED(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align)), \
+ ci_ptr_arith_t)
+
+ /*! How far to next boundary? NZ = not zero i.e. give align if on boundary */
+
+#define CI_PTR_ALIGN_NEEDED_NZ(p, align) \
+ ((align) - (((char*)p) - \
+ ((char*) CI_ALIGN_BACK(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align))))))
+
+ /*! How far beyond prev boundary? */
+
+#define CI_PTR_OFFSET(p, align) \
+ CI_OFFSET(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align)))
+
+
+ /* Same as CI_ALIGN_FWD and CI_ALIGN_BACK. */
+
+#define CI_ROUND_UP(i, align) (((i)+(align)-1u) & ~((align)-1u))
+
+#define CI_ROUND_DOWN(i, align) ((i) & ~((align)-1u))
+
+
+/**********************************************************************
+ * Byte-order
+ **********************************************************************/
+
+/* These are not flags. They are enumeration values for use with
+ * CI_MY_BYTE_ORDER. */
+#define CI_BIG_ENDIAN 1
+#define CI_LITTLE_ENDIAN 0
+
+/*
+** Note that these byte-swapping primitives may leave junk in bits above
+** the range they operate on.
+**
+** The CI_BSWAP_nn() routines require that bits above [nn] are zero. Use
+** CI_BSWAPM_nn(x) if this cannot be guaranteed.
+*/
+
+/* ?? May be able to improve on some of these with inline assembler on some
+** platforms.
+*/
+
+#define CI_BSWAP_16(v) ((((v) & 0xff) << 8) | ((v) >> 8))
+#define CI_BSWAPM_16(v) ((((v) & 0xff) << 8) | (((v) & 0xff00) >> 8))
+
+#define CI_BSWAP_32(v) (((v) >> 24) | \
+ (((v) & 0x00ff0000) >> 8) | \
+ (((v) & 0x0000ff00) << 8) | \
+ ((v) << 24))
+#define CI_BSWAPM_32(v) ((((v) & 0xff000000) >> 24) | \
+ (((v) & 0x00ff0000) >> 8) | \
+ (((v) & 0x0000ff00) << 8) | \
+ ((v) << 24))
+
+#define CI_BSWAP_64(v) (((v) >> 56) | \
+ (((v) & 0x00ff000000000000) >> 40) | \
+ (((v) & 0x0000ff0000000000) >> 24) | \
+ (((v) & 0x000000ff00000000) >> 8) | \
+ (((v) & 0x00000000ff000000) << 8) | \
+ (((v) & 0x0000000000ff0000) << 24) | \
+ (((v) & 0x000000000000ff00) << 40) | \
+ ((v) << 56))
+
+# define CI_BSWAPPED_16_IF(c,v) ((c) ? CI_BSWAP_16(v) : (v))
+# define CI_BSWAPPED_32_IF(c,v) ((c) ? CI_BSWAP_32(v) : (v))
+# define CI_BSWAPPED_64_IF(c,v) ((c) ? CI_BSWAP_64(v) : (v))
+# define CI_BSWAP_16_IF(c,v) do{ if((c)) (v) = CI_BSWAP_16(v); }while(0)
+# define CI_BSWAP_32_IF(c,v) do{ if((c)) (v) = CI_BSWAP_32(v); }while(0)
+# define CI_BSWAP_64_IF(c,v) do{ if((c)) (v) = CI_BSWAP_64(v); }while(0)
+
+#if (CI_MY_BYTE_ORDER == CI_LITTLE_ENDIAN)
+# define CI_BSWAP_LE16(v) (v)
+# define CI_BSWAP_LE32(v) (v)
+# define CI_BSWAP_LE64(v) (v)
+# define CI_BSWAP_BE16(v) CI_BSWAP_16(v)
+# define CI_BSWAP_BE32(v) CI_BSWAP_32(v)
+# define CI_BSWAP_BE64(v) CI_BSWAP_64(v)
+# define CI_BSWAPM_LE16(v) (v)
+# define CI_BSWAPM_LE32(v) (v)
+# define CI_BSWAPM_LE64(v) (v)
+# define CI_BSWAPM_BE16(v) CI_BSWAPM_16(v)
+# define CI_BSWAPM_BE32(v) CI_BSWAPM_32(v)
+#elif (CI_MY_BYTE_ORDER == CI_BIG_ENDIAN)
+# define CI_BSWAP_BE16(v) (v)
+# define CI_BSWAP_BE32(v) (v)
+# define CI_BSWAP_BE64(v) (v)
+# define CI_BSWAP_LE16(v) CI_BSWAP_16(v)
+# define CI_BSWAP_LE32(v) CI_BSWAP_32(v)
+# define CI_BSWAP_LE64(v) CI_BSWAP_64(v)
+# define CI_BSWAPM_BE16(v) (v)
+# define CI_BSWAPM_BE32(v) (v)
+# define CI_BSWAPM_BE64(v) (v)
+# define CI_BSWAPM_LE16(v) CI_BSWAPM_16(v)
+# define CI_BSWAPM_LE32(v) CI_BSWAPM_32(v)
+#else
+# error Bad endian.
+#endif
+
+
+/**********************************************************************
+ * Get pointer to struct from pointer to member
+ **********************************************************************/
+
+#define CI_MEMBER_OFFSET(c_type, mbr_name) \
+ ((ci_uint32) (ci_uintptr_t)(&((c_type*)0)->mbr_name))
+
+#define CI_MEMBER_SIZE(c_type, mbr_name) \
+ sizeof(((c_type*)0)->mbr_name)
+
+#define __CI_CONTAINER(c_type, mbr_name, p_mbr) \
+ ( (c_type*) ((char*)(p_mbr) - CI_MEMBER_OFFSET(c_type, mbr_name)) )
+
+#ifndef CI_CONTAINER
+# define CI_CONTAINER(t,m,p) __CI_CONTAINER(t,m,p)
+#endif
+
+
+/**********************************************************************
+ * Structure member initialiser.
+ **********************************************************************/
+
+#ifndef CI_STRUCT_MBR
+# define CI_STRUCT_MBR(name, val) .name = val
+#endif
+
+
+/**********************************************************************
+ * min / max
+ **********************************************************************/
+
+#define CI_MIN(x,y) (((x) < (y)) ? (x) : (y))
+#define CI_MAX(x,y) (((x) > (y)) ? (x) : (y))
+
+/**********************************************************************
+ * abs
+ **********************************************************************/
+
+#define CI_ABS(x) (((x) < 0) ? -(x) : (x))
+
+/**********************************************************************
+ * Conditional debugging
+ **********************************************************************/
+
+#ifdef NDEBUG
+# define CI_DEBUG(x)
+# define CI_NDEBUG(x) x
+# define CI_IF_DEBUG(y,n) (n)
+# define CI_DEBUG_ARG(x)
+#else
+# define CI_DEBUG(x) x
+# define CI_NDEBUG(x)
+# define CI_IF_DEBUG(y,n) (y)
+# define CI_DEBUG_ARG(x) ,x
+#endif
+
+#ifdef __KERNEL__
+#define CI_KERNEL_ARG(x) ,x
+#else
+#define CI_KERNEL_ARG(x)
+#endif
+
+#ifdef _WIN32
+# define CI_KERNEL_ARG_WIN(x) CI_KERNEL_ARG(x)
+# define CI_ARG_WIN(x) ,x
+#else
+# define CI_KERNEL_ARG_WIN(x)
+# define CI_ARG_WIN(x)
+#endif
+
+#ifdef __unix__
+# define CI_KERNEL_ARG_UNIX(x) CI_KERNEL_ARG(x)
+# define CI_ARG_UNIX(x) ,x
+#else
+# define CI_KERNEL_ARG_UNIX(x)
+# define CI_ARG_UNIX(x)
+#endif
+
+#ifdef __linux__
+# define CI_KERNEL_ARG_LINUX(x) CI_KERNEL_ARG(x)
+# define CI_ARG_LINUX(x) ,x
+#else
+# define CI_KERNEL_ARG_LINUX(x)
+# define CI_ARG_LINUX(x)
+#endif
+
+
+#endif /* __CI_COMPAT_UTILS_H__ */
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_compat */
+
+#ifndef __CI_COMPAT_X86_H__
+#define __CI_COMPAT_X86_H__
+
+
+#define CI_MY_BYTE_ORDER CI_LITTLE_ENDIAN
+
+#define CI_WORD_SIZE 4
+#define CI_PTR_SIZE 4
+
+#define CI_PAGE_SIZE 4096
+#define CI_PAGE_SHIFT 12
+#define CI_PAGE_MASK (~(CI_PAGE_SIZE - 1))
+
+#define CI_CPU_HAS_SSE 1 /* SSE extensions supported */
+#define CI_CPU_HAS_SSE2 0 /* SSE2 extensions supported */
+#define CI_CPU_OOS 0 /* CPU does out of order stores */
+
+
+#endif /* __CI_COMPAT_X86_H__ */
+
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * \author djr
+ * \brief Arch stuff for AMD x86_64.
+ * \date 2004/08/17
+ */
+
+/*! \cidoxg_include_ci_compat */
+#ifndef __CI_COMPAT_X86_64_H__
+#define __CI_COMPAT_X86_64_H__
+
+
+#define CI_MY_BYTE_ORDER CI_LITTLE_ENDIAN
+
+#define CI_WORD_SIZE 8
+#define CI_PTR_SIZE 8
+
+#define CI_PAGE_SIZE 4096
+#define CI_PAGE_SHIFT 12
+#define CI_PAGE_MASK (~(CI_PAGE_SIZE - 1))
+
+#define CI_CPU_HAS_SSE 1 /* SSE extensions supported */
+
+/* SSE2 disabled while investigating BUG1060 */
+#define CI_CPU_HAS_SSE2 0 /* SSE2 extensions supported */
+#define CI_CPU_OOS 0 /* CPU does out of order stores */
+
+
+#endif /* __CI_COMPAT_X86_64_H__ */
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Driver for Solarflare network controllers -
+ * resource management for Xen backend, OpenOnload, etc
+ * (including support for SFE4001 10GBT NIC)
+ *
+ * This file contains public EFX VI API to Solarflare resource manager.
+ *
+ * Copyright 2005-2007: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Developed and maintained by Solarflare Communications:
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef __CI_DRIVER_RESOURCE_EFX_VI_H__
+#define __CI_DRIVER_RESOURCE_EFX_VI_H__
+
+/* Default size of event queue in the efx_vi resource. Copied from
+ * CI_CFG_NETIF_EVENTQ_SIZE */
+#define EFX_VI_EVENTQ_SIZE_DEFAULT 1024
+
+extern int efx_vi_eventq_size;
+
+/**************************************************************************
+ * efx_vi_state types, allocation and free
+ **************************************************************************/
+
+/*! Handle for refering to a efx_vi */
+struct efx_vi_state;
+
+/*!
+ * Allocate an efx_vi, including event queue and pt_endpoint
+ *
+ * \param vih_out Pointer to a handle that is set on success
+ * \param nic_index Index of NIC to apply this resource to
+ * \return Zero on success (and vih_out set), non-zero on failure.
+ */
+extern int
+efx_vi_alloc(struct efx_vi_state **vih_out, int nic_index);
+
+/*!
+ * Free a previously allocated efx_vi
+ *
+ * \param vih The handle of the efx_vi to free
+ */
+extern void
+efx_vi_free(struct efx_vi_state *vih);
+
+/*!
+ * Reset a previously allocated efx_vi
+ *
+ * \param vih The handle of the efx_vi to reset
+ */
+extern void
+efx_vi_reset(struct efx_vi_state *vih);
+
+/**************************************************************************
+ * efx_vi_eventq types and functions
+ **************************************************************************/
+
+/*!
+ * Register a function to receive callbacks when event queue timeouts
+ * or wakeups occur. Only one function per efx_vi can be registered
+ * at once.
+ *
+ * \param vih The handle to identify the efx_vi
+ * \param callback The function to callback
+ * \param context An argument to pass to the callback function
+ * \return Zero on success, non-zero on failure.
+ */
+extern int
+efx_vi_eventq_register_callback(struct efx_vi_state *vih,
+ void (*callback)(void *context, int is_timeout),
+ void *context);
+
+/*!
+ * Remove the current eventq timeout or wakeup callback function
+ *
+ * \param vih The handle to identify the efx_vi
+ * \return Zero on success, non-zero on failure
+ */
+extern int
+efx_vi_eventq_kill_callback(struct efx_vi_state *vih);
+
+/**************************************************************************
+ * efx_vi_dma_map types and functions
+ **************************************************************************/
+
+/*!
+ * Handle for refering to a efx_vi
+ */
+struct efx_vi_dma_map_state;
+
+/*!
+ * Map a list of buffer pages so they are registered with the hardware
+ *
+ * \param vih The handle to identify the efx_vi
+ * \param addrs An array of page pointers to map
+ * \param n_addrs Length of the page pointer array. Must be a power of two.
+ * \param dmh_out Set on success to a handle used to refer to this mapping
+ * \return Zero on success, non-zero on failure.
+ */
+extern int
+efx_vi_dma_map_pages(struct efx_vi_state *vih, struct page **pages,
+ int n_pages, struct efx_vi_dma_map_state **dmh_out);
+extern int
+efx_vi_dma_map_addrs(struct efx_vi_state *vih,
+ unsigned long long *dev_bus_addrs, int n_pages,
+ struct efx_vi_dma_map_state **dmh_out);
+
+/*!
+ * Unmap a previously mapped set of pages so they are no longer registered
+ * with the hardware.
+ *
+ * \param vih The handle to identify the efx_vi
+ * \param dmh The handle to identify the dma mapping
+ */
+extern void
+efx_vi_dma_unmap_pages(struct efx_vi_state *vih,
+ struct efx_vi_dma_map_state *dmh);
+extern void
+efx_vi_dma_unmap_addrs(struct efx_vi_state *vih,
+ struct efx_vi_dma_map_state *dmh);
+
+/*!
+ * Retrieve the buffer address of the mapping
+ *
+ * \param vih The handle to identify the efx_vi
+ * \param dmh The handle to identify the buffer mapping
+ * \return The buffer address on success, or zero on failure
+ */
+extern unsigned
+efx_vi_dma_get_map_addr(struct efx_vi_state *vih,
+ struct efx_vi_dma_map_state *dmh);
+
+/**************************************************************************
+ * efx_vi filter functions
+ **************************************************************************/
+
+#define EFX_VI_STATIC_FILTERS 32
+
+/*! Handle to refer to a filter instance */
+struct filter_resource_t;
+
+/*!
+ * Allocate and add a filter
+ *
+ * \param vih The handle to identify the efx_vi
+ * \param protocol The protocol of the new filter: UDP or TCP
+ * \param ip_addr_be32 The local ip address of the filter
+ * \param port_le16 The local port of the filter
+ * \param fh_out Set on success to be a handle to refer to this filter
+ * \return Zero on success, non-zero on failure.
+ */
+extern int
+efx_vi_filter(struct efx_vi_state *vih, int protocol, unsigned ip_addr_be32,
+ int port_le16, struct filter_resource_t **fh_out);
+
+/*!
+ * Remove a filter and free resources associated with it
+ *
+ * \param vih The handle to identify the efx_vi
+ * \param fh The handle to identify the filter
+ * \return Zero on success, non-zero on failure
+ */
+extern int
+efx_vi_filter_stop(struct efx_vi_state *vih, struct filter_resource_t *fh);
+
+/**************************************************************************
+ * efx_vi hw resources types and functions
+ **************************************************************************/
+
+/*! Constants for the type field in efx_vi_hw_resource */
+#define EFX_VI_HW_RESOURCE_TXDMAQ 0x0 /* PFN of TX DMA Q */
+#define EFX_VI_HW_RESOURCE_RXDMAQ 0x1 /* PFN of RX DMA Q */
+#define EFX_VI_HW_RESOURCE_TXBELL 0x2 /* PFN of TX Doorbell (EF1) */
+#define EFX_VI_HW_RESOURCE_RXBELL 0x3 /* PFN of RX Doorbell (EF1) */
+#define EFX_VI_HW_RESOURCE_EVQTIMER 0x4 /* Address of event q timer */
+
+/* Address of event q pointer (EF1) */
+#define EFX_VI_HW_RESOURCE_EVQPTR 0x5
+/* Address of register pointer (Falcon A) */
+#define EFX_VI_HW_RESOURCE_EVQRPTR 0x6
+/* Offset of register pointer (Falcon B) */
+#define EFX_VI_HW_RESOURCE_EVQRPTR_OFFSET 0x7
+/* Address of mem KVA */
+#define EFX_VI_HW_RESOURCE_EVQMEMKVA 0x8
+/* PFN of doorbell page (Falcon) */
+#define EFX_VI_HW_RESOURCE_BELLPAGE 0x9
+
+/*! How large an array to allocate for the get_() functions - smaller
+ than the total number of constants as some are mutually exclusive */
+#define EFX_VI_HW_RESOURCE_MAXSIZE 0x7
+
+/*! Constants for the mem_type field in efx_vi_hw_resource */
+#define EFX_VI_HW_RESOURCE_IOBUFFER 0 /* Host memory */
+#define EFX_VI_HW_RESOURCE_PERIPHERAL 1 /* Card memory/registers */
+
+/*!
+ * Data structure providing information on a hardware resource mapping
+ */
+struct efx_vi_hw_resource {
+ u8 type; /*!< What this resource represents */
+ u8 mem_type; /*!< What type of memory is it in, eg,
+ * host or iomem */
+ u8 more_to_follow; /*!< Is this part of a multi-region resource */
+ u32 length; /*!< Length of the resource in bytes */
+ unsigned long address; /*!< Address of this resource */
+};
+
+/*!
+ * Metadata concerning the list of hardware resource mappings
+ */
+struct efx_vi_hw_resource_metadata {
+ int version;
+ int evq_order;
+ int evq_offs;
+ int evq_capacity;
+ int instance;
+ unsigned rx_capacity;
+ unsigned tx_capacity;
+ int nic_arch;
+ int nic_revision;
+ char nic_variant;
+};
+
+/*!
+ * Obtain a list of hardware resource mappings, using virtual addresses
+ *
+ * \param vih The handle to identify the efx_vi
+ * \param mdata Pointer to a structure to receive the metadata
+ * \param hw_res_array An array to receive the list of hardware resources
+ * \param length The length of hw_res_array. Updated on success to contain
+ * the number of entries in the supplied array that were used.
+ * \return Zero on success, non-zero on failure
+ */
+extern int
+efx_vi_hw_resource_get_virt(struct efx_vi_state *vih,
+ struct efx_vi_hw_resource_metadata *mdata,
+ struct efx_vi_hw_resource *hw_res_array,
+ int *length);
+
+/*!
+ * Obtain a list of hardware resource mappings, using physical addresses
+ *
+ * \param vih The handle to identify the efx_vi
+ * \param mdata Pointer to a structure to receive the metadata
+ * \param hw_res_array An array to receive the list of hardware resources
+ * \param length The length of hw_res_array. Updated on success to contain
+ * the number of entries in the supplied array that were used.
+ * \return Zero on success, non-zero on failure
+ */
+extern int
+efx_vi_hw_resource_get_phys(struct efx_vi_state *vih,
+ struct efx_vi_hw_resource_metadata *mdata,
+ struct efx_vi_hw_resource *hw_res_array,
+ int *length);
+
+#endif /* __CI_DRIVER_RESOURCE_EFX_VI_H__ */
--- /dev/null
+/****************************************************************************
+ * Driver for Solarflare network controllers -
+ * resource management for Xen backend, OpenOnload, etc
+ * (including support for SFE4001 10GBT NIC)
+ *
+ * This file provides API of the efhw library which may be used both from
+ * the kernel and from the user-space code.
+ *
+ * Copyright 2005-2007: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Developed and maintained by Solarflare Communications:
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * Certain parts of the driver were implemented by
+ * Alexandra Kossovsky <Alexandra.Kossovsky@oktetlabs.ru>
+ * OKTET Labs Ltd, Russia,
+ * http://oktetlabs.ru, <info@oktetlabs.ru>
+ * by request of Solarflare Communications
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef __CI_EFHW_COMMON_H__
+#define __CI_EFHW_COMMON_H__
+
+#include <ci/efhw/common_sysdep.h>
+
+enum efhw_arch {
+ EFHW_ARCH_FALCON,
+ EFHW_ARCH_SIENA,
+};
+
+typedef uint32_t efhw_buffer_addr_t;
+#define EFHW_BUFFER_ADDR_FMT "[ba:%"PRIx32"]"
+
+/*! Comment? */
+typedef union {
+ uint64_t u64;
+ struct {
+ uint32_t a;
+ uint32_t b;
+ } opaque;
+ struct {
+ uint32_t code;
+ uint32_t status;
+ } ev1002;
+} efhw_event_t;
+
+/* Flags for TX/RX queues */
+#define EFHW_VI_JUMBO_EN 0x01 /*! scatter RX over multiple desc */
+#define EFHW_VI_ISCSI_RX_HDIG_EN 0x02 /*! iscsi rx header digest */
+#define EFHW_VI_ISCSI_TX_HDIG_EN 0x04 /*! iscsi tx header digest */
+#define EFHW_VI_ISCSI_RX_DDIG_EN 0x08 /*! iscsi rx data digest */
+#define EFHW_VI_ISCSI_TX_DDIG_EN 0x10 /*! iscsi tx data digest */
+#define EFHW_VI_TX_PHYS_ADDR_EN 0x20 /*! TX physical address mode */
+#define EFHW_VI_RX_PHYS_ADDR_EN 0x40 /*! RX physical address mode */
+#define EFHW_VI_RM_WITH_INTERRUPT 0x80 /*! VI with an interrupt */
+#define EFHW_VI_TX_IP_CSUM_DIS 0x100 /*! enable ip checksum generation */
+#define EFHW_VI_TX_TCPUDP_CSUM_DIS 0x200 /*! enable tcp/udp checksum
+ generation */
+#define EFHW_VI_TX_TCPUDP_ONLY 0x400 /*! drop non-tcp/udp packets */
+
+/* Types of hardware filter */
+/* Each of these values implicitly selects scatter filters on B0 - or in
+ EFHW_IP_FILTER_TYPE_NOSCAT_B0_MASK if a non-scatter filter is required */
+#define EFHW_IP_FILTER_TYPE_UDP_WILDCARD (0) /* dest host only */
+#define EFHW_IP_FILTER_TYPE_UDP_FULL (1) /* dest host and port */
+#define EFHW_IP_FILTER_TYPE_TCP_WILDCARD (2) /* dest based filter */
+#define EFHW_IP_FILTER_TYPE_TCP_FULL (3) /* src filter */
+/* Same again, but with RSS (for B0 only) */
+#define EFHW_IP_FILTER_TYPE_UDP_WILDCARD_RSS_B0 (4)
+#define EFHW_IP_FILTER_TYPE_UDP_FULL_RSS_B0 (5)
+#define EFHW_IP_FILTER_TYPE_TCP_WILDCARD_RSS_B0 (6)
+#define EFHW_IP_FILTER_TYPE_TCP_FULL_RSS_B0 (7)
+
+#define EFHW_IP_FILTER_TYPE_FULL_MASK (0x1) /* Mask for full / wildcard */
+#define EFHW_IP_FILTER_TYPE_TCP_MASK (0x2) /* Mask for TCP type */
+#define EFHW_IP_FILTER_TYPE_RSS_B0_MASK (0x4) /* Mask for B0 RSS enable */
+#define EFHW_IP_FILTER_TYPE_NOSCAT_B0_MASK (0x8) /* Mask for B0 SCATTER dsbl */
+
+#define EFHW_IP_FILTER_TYPE_MASK (0xffff) /* Mask of types above */
+
+#define EFHW_IP_FILTER_BROADCAST (0x10000) /* driverlink filter
+ support */
+
+#endif /* __CI_EFHW_COMMON_H__ */
--- /dev/null
+/****************************************************************************
+ * Driver for Solarflare network controllers -
+ * resource management for Xen backend, OpenOnload, etc
+ * (including support for SFE4001 10GBT NIC)
+ *
+ * This file provides version-independent Linux kernel API for
+ * userland-to-kernel interfaces.
+ * Only kernels >=2.6.9 are supported.
+ *
+ * Copyright 2005-2007: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Developed and maintained by Solarflare Communications:
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * Certain parts of the driver were implemented by
+ * Alexandra Kossovsky <Alexandra.Kossovsky@oktetlabs.ru>
+ * OKTET Labs Ltd, Russia,
+ * http://oktetlabs.ru, <info@oktetlabs.ru>
+ * by request of Solarflare Communications
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef __CI_EFHW_COMMON_LINUX_H__
+#define __CI_EFHW_COMMON_LINUX_H__
+
+#include <linux/types.h>
+#include <linux/version.h>
+
+/* Dirty hack, but Linux kernel does not provide DMA_ADDR_T_FMT */
+#if BITS_PER_LONG == 64 || defined(CONFIG_HIGHMEM64G)
+#define DMA_ADDR_T_FMT "%llx"
+#else
+#define DMA_ADDR_T_FMT "%x"
+#endif
+
+/* Linux kernel also does not provide PRIx32... Sigh. */
+#define PRIx32 "x"
+#define PRIx64 "llx"
+
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+enum {
+ false = 0,
+ true = 1
+};
+
+typedef _Bool bool;
+#endif /* LINUX_VERSION_CODE < 2.6.19 */
+
+#endif /* __CI_EFHW_COMMON_LINUX_H__ */
--- /dev/null
+/****************************************************************************
+ * Driver for Solarflare network controllers -
+ * resource management for Xen backend, OpenOnload, etc
+ * (including support for SFE4001 10GBT NIC)
+ *
+ * This file provides debug-related API for efhw library using Linux kernel
+ * primitives.
+ *
+ * Copyright 2005-2007: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Developed and maintained by Solarflare Communications:
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * Certain parts of the driver were implemented by
+ * Alexandra Kossovsky <Alexandra.Kossovsky@oktetlabs.ru>
+ * OKTET Labs Ltd, Russia,
+ * http://oktetlabs.ru, <info@oktetlabs.ru>
+ * by request of Solarflare Communications
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef __CI_EFHW_DEBUG_LINUX_H__
+#define __CI_EFHW_DEBUG_LINUX_H__
+
+#define EFHW_PRINTK_PREFIX "[sfc efhw] "
+
+#define EFHW_PRINTK(level, fmt, ...) \
+ printk(level EFHW_PRINTK_PREFIX fmt "\n", __VA_ARGS__)
+
+/* Following macros should be used with non-zero format parameters
+ * due to __VA_ARGS__ limitations. Use "%s" with __FUNCTION__ if you can't
+ * find better parameters. */
+#define EFHW_ERR(fmt, ...) EFHW_PRINTK(KERN_ERR, fmt, __VA_ARGS__)
+#define EFHW_WARN(fmt, ...) EFHW_PRINTK(KERN_WARNING, fmt, __VA_ARGS__)
+#define EFHW_NOTICE(fmt, ...) EFHW_PRINTK(KERN_NOTICE, fmt, __VA_ARGS__)
+#if 0 && !defined(NDEBUG)
+#define EFHW_TRACE(fmt, ...) EFHW_PRINTK(KERN_DEBUG, fmt, __VA_ARGS__)
+#else
+#define EFHW_TRACE(fmt, ...)
+#endif
+
+#ifndef NDEBUG
+#define EFHW_ASSERT(cond) BUG_ON((cond) == 0)
+#define EFHW_DO_DEBUG(expr) expr
+#else
+#define EFHW_ASSERT(cond)
+#define EFHW_DO_DEBUG(expr)
+#endif
+
+#define EFHW_TEST(expr) \
+ do { \
+ if (unlikely(!(expr))) \
+ BUG(); \
+ } while (0)
+
+/* Build time asserts. We paste the line number into the type name
+ * so that the macro can be used more than once per file even if the
+ * compiler objects to multiple identical typedefs. Collisions
+ * between use in different header files is still possible. */
+#ifndef EFHW_BUILD_ASSERT
+#define __EFHW_BUILD_ASSERT_NAME(_x) __EFHW_BUILD_ASSERT_ILOATHECPP(_x)
+#define __EFHW_BUILD_ASSERT_ILOATHECPP(_x) __EFHW_BUILD_ASSERT__ ##_x
+#define EFHW_BUILD_ASSERT(e) \
+ typedef char __EFHW_BUILD_ASSERT_NAME(__LINE__)[(e) ? 1 : -1]
+#endif
+
+#endif /* __CI_EFHW_DEBUG_LINUX_H__ */
--- /dev/null
+/****************************************************************************
+ * Driver for Solarflare network controllers -
+ * resource management for Xen backend, OpenOnload, etc
+ * (including support for SFE4001 10GBT NIC)
+ *
+ * This file provides some limits used in both kernel and userland code.
+ *
+ * Copyright 2005-2007: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Developed and maintained by Solarflare Communications:
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * Certain parts of the driver were implemented by
+ * Alexandra Kossovsky <Alexandra.Kossovsky@oktetlabs.ru>
+ * OKTET Labs Ltd, Russia,
+ * http://oktetlabs.ru, <info@oktetlabs.ru>
+ * by request of Solarflare Communications
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef __CI_EFHW_EFAB_CONFIG_H__
+#define __CI_EFHW_EFAB_CONFIG_H__
+
+#define EFHW_MAX_NR_DEVS 5 /* max number of efhw devices supported */
+
+#endif /* __CI_EFHW_EFAB_CONFIG_H__ */
--- /dev/null
+/****************************************************************************
+ * Driver for Solarflare network controllers -
+ * resource management for Xen backend, OpenOnload, etc
+ * (including support for SFE4001 10GBT NIC)
+ *
+ * This file provides struct efhw_nic and some related types.
+ *
+ * Copyright 2005-2007: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Developed and maintained by Solarflare Communications:
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * Certain parts of the driver were implemented by
+ * Alexandra Kossovsky <Alexandra.Kossovsky@oktetlabs.ru>
+ * OKTET Labs Ltd, Russia,
+ * http://oktetlabs.ru, <info@oktetlabs.ru>
+ * by request of Solarflare Communications
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef __CI_EFHW_EFAB_TYPES_H__
+#define __CI_EFHW_EFAB_TYPES_H__
+
+#include <ci/efhw/efhw_config.h>
+#include <ci/efhw/hardware_sysdep.h>
+#include <ci/efhw/iopage_types.h>
+#include <ci/efhw/sysdep.h>
+
+/*--------------------------------------------------------------------
+ *
+ * hardware limits used in the types
+ *
+ *--------------------------------------------------------------------*/
+
+#define EFHW_KEVENTQ_MAX 8
+
+/*--------------------------------------------------------------------
+ *
+ * forward type declarations
+ *
+ *--------------------------------------------------------------------*/
+
+struct efhw_nic;
+
+/*--------------------------------------------------------------------
+ *
+ * Managed interface
+ *
+ *--------------------------------------------------------------------*/
+
+struct efhw_buffer_table_allocation{
+ unsigned base;
+ unsigned order;
+};
+
+struct eventq_resource_hardware {
+ /*!iobuffer allocated for eventq - can be larger than eventq */
+ efhw_iopages_t iobuff;
+ unsigned iobuff_off;
+ struct efhw_buffer_table_allocation buf_tbl_alloc;
+ int capacity; /*!< capacity of event queue */
+};
+
+/*--------------------------------------------------------------------
+ *
+ * event queues and event driven callbacks
+ *
+ *--------------------------------------------------------------------*/
+
+struct efhw_keventq {
+ volatile int lock;
+ caddr_t evq_base;
+ int32_t evq_ptr;
+ uint32_t evq_mask;
+ unsigned instance;
+ struct eventq_resource_hardware hw;
+ struct efhw_ev_handler *ev_handlers;
+};
+
+/**********************************************************************
+ * Portable HW interface. ***************************************
+ **********************************************************************/
+
+/*--------------------------------------------------------------------
+ *
+ * EtherFabric Functional units - configuration and control
+ *
+ *--------------------------------------------------------------------*/
+
+struct efhw_func_ops {
+
+ /*-------------- Initialisation ------------ */
+
+ /*! close down all hardware functional units - leaves NIC in a safe
+ state for driver unload */
+ void (*close_hardware) (struct efhw_nic *nic);
+
+ /*! initialise all hardware functional units */
+ int (*init_hardware) (struct efhw_nic *nic,
+ struct efhw_ev_handler *,
+ const uint8_t *mac_addr);
+
+ /*-------------- Interrupt support ------------ */
+
+ /*! Main interrupt routine
+ ** This function returns,
+ ** - zero, if the IRQ was not generated by EF1
+ ** - non-zero, if EF1 was the source of the IRQ
+ **
+ **
+ ** opaque is an OS provided pointer for use by the OS callbacks
+ ** e.g in Windows used to indicate DPC scheduled
+ */
+ int (*interrupt) (struct efhw_nic *nic);
+
+ /*! Enable given interrupt mask for the given IRQ unit */
+ void (*interrupt_enable) (struct efhw_nic *nic, uint idx);
+
+ /*! Disable given interrupt mask for the given IRQ unit */
+ void (*interrupt_disable) (struct efhw_nic *nic, uint idx);
+
+ /*! Set interrupt moderation strategy for the given IRQ unit
+ ** val is in usec
+ */
+ void (*set_interrupt_moderation)(struct efhw_nic *nic,
+ uint idx, uint val);
+
+ /*-------------- Event support ------------ */
+
+ /*! Enable the given event queue
+ depending on the underlying implementation (EF1 or Falcon) then
+ either a q_base_addr in host memory, or a buffer base id should
+ be proivded
+ */
+ void (*event_queue_enable) (struct efhw_nic *nic,
+ uint evq, /* evnt queue index */
+ uint evq_size, /* units of #entries */
+ dma_addr_t q_base_addr, uint buf_base_id);
+
+ /*! Disable the given event queue (and any associated timer) */
+ void (*event_queue_disable) (struct efhw_nic *nic, uint evq,
+ int timer_only);
+
+ /*! request wakeup from the NIC on a given event Q */
+ void (*wakeup_request) (struct efhw_nic *nic, dma_addr_t q_base_addr,
+ int next_i, int evq);
+
+ /*! Push a SW event on a given eventQ */
+ void (*sw_event) (struct efhw_nic *nic, int data, int evq);
+
+ /*-------------- Filter support ------------ */
+
+ /*! Setup a given filter - The software can request a filter_i,
+ * but some EtherFabric implementations will override with
+ * a more suitable index
+ */
+ int (*ipfilter_set) (struct efhw_nic *nic, int type,
+ int *filter_i, int dmaq,
+ unsigned saddr_be32, unsigned sport_be16,
+ unsigned daddr_be32, unsigned dport_be16);
+
+ /*! Attach a given filter to a DMAQ */
+ void (*ipfilter_attach) (struct efhw_nic *nic, int filter_idx,
+ int dmaq_idx);
+
+ /*! Detach a filter from its DMAQ */
+ void (*ipfilter_detach) (struct efhw_nic *nic, int filter_idx);
+
+ /*! Clear down a given filter */
+ void (*ipfilter_clear) (struct efhw_nic *nic, int filter_idx);
+
+ /*-------------- DMA support ------------ */
+
+ /*! Initialise NIC state for a given TX DMAQ */
+ void (*dmaq_tx_q_init) (struct efhw_nic *nic,
+ uint dmaq, uint evq, uint owner, uint tag,
+ uint dmaq_size, uint buf_idx, uint flags);
+
+ /*! Initialise NIC state for a given RX DMAQ */
+ void (*dmaq_rx_q_init) (struct efhw_nic *nic,
+ uint dmaq, uint evq, uint owner, uint tag,
+ uint dmaq_size, uint buf_idx, uint flags);
+
+ /*! Disable a given TX DMAQ */
+ void (*dmaq_tx_q_disable) (struct efhw_nic *nic, uint dmaq);
+
+ /*! Disable a given RX DMAQ */
+ void (*dmaq_rx_q_disable) (struct efhw_nic *nic, uint dmaq);
+
+ /*! Flush a given TX DMA channel */
+ int (*flush_tx_dma_channel) (struct efhw_nic *nic, uint dmaq);
+
+ /*! Flush a given RX DMA channel */
+ int (*flush_rx_dma_channel) (struct efhw_nic *nic, uint dmaq);
+
+ /*-------------- Buffer table Support ------------ */
+
+ /*! Initialise a buffer table page */
+ void (*buffer_table_set) (struct efhw_nic *nic,
+ dma_addr_t dma_addr,
+ uint bufsz, uint region,
+ int own_id, int buffer_id);
+
+ /*! Initialise a block of buffer table pages */
+ void (*buffer_table_set_n) (struct efhw_nic *nic, int buffer_id,
+ dma_addr_t dma_addr,
+ uint bufsz, uint region,
+ int n_pages, int own_id);
+
+ /*! Clear a block of buffer table pages */
+ void (*buffer_table_clear) (struct efhw_nic *nic, int buffer_id,
+ int num);
+
+ /*! Commit a buffer table update */
+ void (*buffer_table_commit) (struct efhw_nic *nic);
+
+};
+
+
+/*----------------------------------------------------------------------------
+ *
+ * NIC type
+ *
+ *---------------------------------------------------------------------------*/
+
+struct efhw_device_type {
+ int arch; /* enum efhw_arch */
+ char variant; /* 'A', 'B', ... */
+ int revision; /* 0, 1, ... */
+};
+
+
+/*----------------------------------------------------------------------------
+ *
+ * EtherFabric NIC instance - nic.c for HW independent functions
+ *
+ *---------------------------------------------------------------------------*/
+
+/*! */
+struct efhw_nic {
+ /*! zero base index in efrm_nic_table.nic array */
+ volatile int index;
+ int ifindex; /*!< OS level nic index */
+#ifdef HAS_NET_NAMESPACE
+ struct net *nd_net;
+#endif
+
+ struct efhw_device_type devtype;
+
+ /*! Options that can be set by user. */
+ unsigned options;
+# define NIC_OPT_EFTEST 0x1 /* owner is an eftest app */
+
+# define NIC_OPT_DEFAULT 0
+
+ /*! Internal flags that indicate hardware properties at runtime. */
+ unsigned flags;
+# define NIC_FLAG_NO_INTERRUPT 0x01 /* to be set at init time only */
+# define NIC_FLAG_TRY_MSI 0x02
+# define NIC_FLAG_MSI 0x04
+# define NIC_FLAG_OS_IRQ_EN 0x08
+# define NIC_FLAG_10G 0x10
+
+ unsigned mtu; /*!< MAC MTU (includes MAC hdr) */
+
+ /* hardware resources */
+
+ /*! I/O address of the start of the bar */
+ efhw_ioaddr_t bar_ioaddr;
+
+ /*! Bar number of control aperture. */
+ unsigned ctr_ap_bar;
+ /*! Length of control aperture in bytes. */
+ unsigned ctr_ap_bytes;
+
+ uint8_t mac_addr[ETH_ALEN]; /*!< mac address */
+
+ /*! EtherFabric Functional Units -- functions */
+ const struct efhw_func_ops *efhw_func;
+
+ /* Value read from FPGA version register. Zero for asic. */
+ unsigned fpga_version;
+
+ /*! This lock protects a number of misc NIC resources. It should
+ * only be used for things that can be at the bottom of the lock
+ * order. ie. You mustn't attempt to grab any other lock while
+ * holding this one.
+ */
+ spinlock_t *reg_lock;
+ spinlock_t the_reg_lock;
+
+ int buf_commit_outstanding; /*!< outstanding buffer commits */
+
+ /*! interrupt callbacks (hard-irq) */
+ void (*irq_handler) (struct efhw_nic *, int unit);
+
+ /*! event queues per driver */
+ struct efhw_keventq evq[EFHW_KEVENTQ_MAX];
+
+/* for marking when we are not using an IRQ unit
+ - 0 is a valid offset to an IRQ unit on EF1! */
+#define EFHW_IRQ_UNIT_UNUSED 0xffff
+ /*! interrupt unit in use */
+ unsigned int irq_unit[EFHW_KEVENTQ_MAX];
+ efhw_iopage_t irq_iobuff; /*!< Falcon SYSERR interrupt */
+
+ /* The new driverlink infrastructure. */
+ struct efx_dl_device *net_driver_dev;
+ struct efx_dlfilt_cb_s *dlfilter_cb;
+
+ /*! Bit masks of the sizes of event queues and dma queues supported
+ * by the nic. */
+ unsigned evq_sizes;
+ unsigned rxq_sizes;
+ unsigned txq_sizes;
+
+ /* Size of filter table (including odd and even banks). */
+ unsigned filter_tbl_size;
+};
+
+
+#define EFHW_KVA(nic) ((nic)->bar_ioaddr)
+
+
+#endif /* __CI_EFHW_EFHW_TYPES_H__ */
--- /dev/null
+/****************************************************************************
+ * Driver for Solarflare network controllers -
+ * resource management for Xen backend, OpenOnload, etc
+ * (including support for SFE4001 10GBT NIC)
+ *
+ * This file provides version-independent Linux kernel API for header files
+ * with hardware-related definitions (in ci/driver/efab/hardware*).
+ * Only kernels >=2.6.9 are supported.
+ *
+ * Copyright 2005-2007: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Developed and maintained by Solarflare Communications:
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * Certain parts of the driver were implemented by
+ * Alexandra Kossovsky <Alexandra.Kossovsky@oktetlabs.ru>
+ * OKTET Labs Ltd, Russia,
+ * http://oktetlabs.ru, <info@oktetlabs.ru>
+ * by request of Solarflare Communications
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef __CI_EFHW_HARDWARE_LINUX_H__
+#define __CI_EFHW_HARDWARE_LINUX_H__
+
+#include <asm/io.h>
+
+#ifdef __LITTLE_ENDIAN
+#define EFHW_IS_LITTLE_ENDIAN
+#elif __BIG_ENDIAN
+#define EFHW_IS_BIG_ENDIAN
+#else
+#error Unknown endianness
+#endif
+
+#ifndef mmiowb
+ #if defined(__i386__) || defined(__x86_64__)
+ #define mmiowb()
+ #elif defined(__ia64__)
+ #ifndef ia64_mfa
+ #define ia64_mfa() asm volatile ("mf.a" ::: "memory")
+ #endif
+ #define mmiowb ia64_mfa
+ #else
+ #error "Need definition for mmiowb()"
+ #endif
+#endif
+
+typedef char *efhw_ioaddr_t;
+
+#ifndef readq
+static inline uint64_t __readq(void __iomem *addr)
+{
+ return *(volatile uint64_t *)addr;
+}
+#define readq(x) __readq(x)
+#endif
+
+#ifndef writeq
+static inline void __writeq(uint64_t v, void __iomem *addr)
+{
+ *(volatile uint64_t *)addr = v;
+}
+#define writeq(val, addr) __writeq((val), (addr))
+#endif
+
+#endif /* __CI_EFHW_HARDWARE_LINUX_H__ */
--- /dev/null
+/****************************************************************************
+ * Driver for Solarflare network controllers -
+ * resource management for Xen backend, OpenOnload, etc
+ * (including support for SFE4001 10GBT NIC)
+ *
+ * This file provides efhw_page_t and efhw_iopage_t for Linux kernel.
+ *
+ * Copyright 2005-2007: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Developed and maintained by Solarflare Communications:
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * Certain parts of the driver were implemented by
+ * Alexandra Kossovsky <Alexandra.Kossovsky@oktetlabs.ru>
+ * OKTET Labs Ltd, Russia,
+ * http://oktetlabs.ru, <info@oktetlabs.ru>
+ * by request of Solarflare Communications
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef __CI_EFHW_IOPAGE_LINUX_H__
+#define __CI_EFHW_IOPAGE_LINUX_H__
+
+#include <linux/gfp.h>
+#include <linux/hardirq.h>
+#include <ci/efhw/debug.h>
+
+/*--------------------------------------------------------------------
+ *
+ * efhw_page_t: A single page of memory. Directly mapped in the driver,
+ * and can be mapped to userlevel.
+ *
+ *--------------------------------------------------------------------*/
+
+typedef struct {
+ unsigned long kva;
+} efhw_page_t;
+
+static inline int efhw_page_alloc(efhw_page_t *p)
+{
+ p->kva = __get_free_page(in_interrupt()? GFP_ATOMIC : GFP_KERNEL);
+ return p->kva ? 0 : -ENOMEM;
+}
+
+static inline int efhw_page_alloc_zeroed(efhw_page_t *p)
+{
+ p->kva = get_zeroed_page(in_interrupt()? GFP_ATOMIC : GFP_KERNEL);
+ return p->kva ? 0 : -ENOMEM;
+}
+
+static inline void efhw_page_free(efhw_page_t *p)
+{
+ free_page(p->kva);
+ EFHW_DO_DEBUG(memset(p, 0, sizeof(*p)));
+}
+
+static inline char *efhw_page_ptr(efhw_page_t *p)
+{
+ return (char *)p->kva;
+}
+
+static inline unsigned efhw_page_pfn(efhw_page_t *p)
+{
+ return (unsigned)(__pa(p->kva) >> PAGE_SHIFT);
+}
+
+static inline void efhw_page_mark_invalid(efhw_page_t *p)
+{
+ p->kva = 0;
+}
+
+static inline int efhw_page_is_valid(efhw_page_t *p)
+{
+ return p->kva != 0;
+}
+
+static inline void efhw_page_init_from_va(efhw_page_t *p, void *va)
+{
+ p->kva = (unsigned long)va;
+}
+
+/*--------------------------------------------------------------------
+ *
+ * efhw_iopage_t: A single page of memory. Directly mapped in the driver,
+ * and can be mapped to userlevel. Can also be accessed by the NIC.
+ *
+ *--------------------------------------------------------------------*/
+
+typedef struct {
+ efhw_page_t p;
+ dma_addr_t dma_addr;
+} efhw_iopage_t;
+
+static inline dma_addr_t efhw_iopage_dma_addr(efhw_iopage_t *p)
+{
+ return p->dma_addr;
+}
+
+#define efhw_iopage_ptr(iop) efhw_page_ptr(&(iop)->p)
+#define efhw_iopage_pfn(iop) efhw_page_pfn(&(iop)->p)
+#define efhw_iopage_mark_invalid(iop) efhw_page_mark_invalid(&(iop)->p)
+#define efhw_iopage_is_valid(iop) efhw_page_is_valid(&(iop)->p)
+
+/*--------------------------------------------------------------------
+ *
+ * efhw_iopages_t: A set of pages that are contiguous in physical memory.
+ * Directly mapped in the driver, and can be mapped to userlevel. Can also
+ * be accessed by the NIC.
+ *
+ * NB. The O/S may be unwilling to allocate many, or even any of these. So
+ * only use this type where the NIC really needs a physically contiguous
+ * buffer.
+ *
+ *--------------------------------------------------------------------*/
+
+typedef struct {
+ caddr_t kva;
+ unsigned order;
+ dma_addr_t dma_addr;
+} efhw_iopages_t;
+
+static inline caddr_t efhw_iopages_ptr(efhw_iopages_t *p)
+{
+ return p->kva;
+}
+
+static inline unsigned efhw_iopages_pfn(efhw_iopages_t *p)
+{
+ return (unsigned)(__pa(p->kva) >> PAGE_SHIFT);
+}
+
+static inline dma_addr_t efhw_iopages_dma_addr(efhw_iopages_t *p)
+{
+ return p->dma_addr;
+}
+
+static inline unsigned efhw_iopages_size(efhw_iopages_t *p)
+{
+ return 1u << (p->order + PAGE_SHIFT);
+}
+
+/* efhw_iopage_t <-> efhw_iopages_t conversions for handling physically
+ * contiguous allocations in iobufsets for iSCSI. This allows the
+ * essential information about contiguous allocations from
+ * efhw_iopages_alloc() to be saved away in the efhw_iopage_t array in an
+ * iobufset. (Changing the iobufset resource to use a union type would
+ * involve a lot of code changes, and make the iobufset's metadata larger
+ * which could be bad as it's supposed to fit into a single page on some
+ * platforms.)
+ */
+static inline void
+efhw_iopage_init_from_iopages(efhw_iopage_t *iopage,
+ efhw_iopages_t *iopages, unsigned pageno)
+{
+ iopage->p.kva = ((unsigned long)efhw_iopages_ptr(iopages))
+ + (pageno * PAGE_SIZE);
+ iopage->dma_addr = efhw_iopages_dma_addr(iopages) +
+ (pageno * PAGE_SIZE);
+}
+
+static inline void
+efhw_iopages_init_from_iopage(efhw_iopages_t *iopages,
+ efhw_iopage_t *iopage, unsigned order)
+{
+ iopages->kva = (caddr_t) efhw_iopage_ptr(iopage);
+ EFHW_ASSERT(iopages->kva);
+ iopages->order = order;
+ iopages->dma_addr = efhw_iopage_dma_addr(iopage);
+}
+
+#endif /* __CI_EFHW_IOPAGE_LINUX_H__ */
--- /dev/null
+/****************************************************************************
+ * Driver for Solarflare network controllers -
+ * resource management for Xen backend, OpenOnload, etc
+ * (including support for SFE4001 10GBT NIC)
+ *
+ * This file provides public API of efhw library exported from the SFC
+ * resource driver.
+ *
+ * Copyright 2005-2007: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Developed and maintained by Solarflare Communications:
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * Certain parts of the driver were implemented by
+ * Alexandra Kossovsky <Alexandra.Kossovsky@oktetlabs.ru>
+ * OKTET Labs Ltd, Russia,
+ * http://oktetlabs.ru, <info@oktetlabs.ru>
+ * by request of Solarflare Communications
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef __CI_EFHW_PUBLIC_H__
+#define __CI_EFHW_PUBLIC_H__
+
+#include <ci/efhw/common.h>
+#include <ci/efhw/efhw_types.h>
+
+/*! Returns true if we have some EtherFabric functional units -
+ whether configured or not */
+static inline int efhw_nic_have_functional_units(struct efhw_nic *nic)
+{
+ return nic->efhw_func != 0;
+}
+
+/*! Returns true if the EtherFabric functional units have been configured */
+static inline int efhw_nic_have_hw(struct efhw_nic *nic)
+{
+ return efhw_nic_have_functional_units(nic) && (EFHW_KVA(nic) != 0);
+}
+
+/*! Helper function to allocate the iobuffer needed by an eventq
+ * - it ensures the eventq has the correct alignment for the NIC
+ *
+ * \param rm Event-queue resource manager
+ * \param instance Event-queue instance (index)
+ * \param buf_bytes Requested size of eventq
+ * \return < 0 if iobuffer allocation fails
+ */
+int efhw_nic_event_queue_alloc_iobuffer(struct efhw_nic *nic,
+ struct eventq_resource_hardware *h,
+ int evq_instance, unsigned buf_bytes);
+
+extern void falcon_nic_set_rx_usr_buf_size(struct efhw_nic *,
+ int rx_usr_buf_size);
+
+extern void
+falcon_nic_rx_filter_ctl_set(struct efhw_nic *nic, uint32_t tcp_full,
+ uint32_t tcp_wild,
+ uint32_t udp_full, uint32_t udp_wild);
+
+extern void
+falcon_nic_rx_filter_ctl_get(struct efhw_nic *nic, uint32_t *tcp_full,
+ uint32_t *tcp_wild,
+ uint32_t *udp_full, uint32_t *udp_wild);
+
+#endif /* __CI_EFHW_PUBLIC_H__ */
--- /dev/null
+/****************************************************************************
+ * Driver for Solarflare network controllers -
+ * resource management for Xen backend, OpenOnload, etc
+ * (including support for SFE4001 10GBT NIC)
+ *
+ * This file provides version-independent Linux kernel API for efhw library.
+ * Only kernels >=2.6.9 are supported.
+ *
+ * Copyright 2005-2007: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Developed and maintained by Solarflare Communications:
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * Certain parts of the driver were implemented by
+ * Alexandra Kossovsky <Alexandra.Kossovsky@oktetlabs.ru>
+ * OKTET Labs Ltd, Russia,
+ * http://oktetlabs.ru, <info@oktetlabs.ru>
+ * by request of Solarflare Communications
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef __CI_EFHW_SYSDEP_LINUX_H__
+#define __CI_EFHW_SYSDEP_LINUX_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/if_ether.h>
+
+#include <linux/netdevice.h> /* necessary for etherdevice.h on some kernels */
+#include <linux/etherdevice.h>
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,21)
+static inline int is_local_ether_addr(const u8 *addr)
+{
+ return (0x02 & addr[0]);
+}
+#endif
+
+typedef unsigned long irq_flags_t;
+
+#define spin_lock_destroy(l_) do {} while (0)
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
+#define HAS_NET_NAMESPACE
+#endif
+
+/* Funny, but linux has round_up for x86 only, defined in
+ * x86-specific header */
+#ifndef round_up
+#define round_up(x, y) (((x) + (y) - 1) & ~((y)-1))
+#endif
+
+#endif /* __CI_EFHW_SYSDEP_LINUX_H__ */
--- /dev/null
+/****************************************************************************
+ * Driver for Solarflare network controllers -
+ * resource management for Xen backend, OpenOnload, etc
+ * (including support for SFE4001 10GBT NIC)
+ *
+ * This file provides public API for NIC table.
+ *
+ * Copyright 2005-2007: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Developed and maintained by Solarflare Communications:
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * Certain parts of the driver were implemented by
+ * Alexandra Kossovsky <Alexandra.Kossovsky@oktetlabs.ru>
+ * OKTET Labs Ltd, Russia,
+ * http://oktetlabs.ru, <info@oktetlabs.ru>
+ * by request of Solarflare Communications
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef __CI_EFRM_NIC_TABLE_H__
+#define __CI_EFRM_NIC_TABLE_H__
+
+#include <ci/efhw/efhw_types.h>
+#include <ci/efrm/sysdep.h>
+
+/*--------------------------------------------------------------------
+ *
+ * struct efrm_nic_table - top level driver object keeping all NICs -
+ * implemented in driver_object.c
+ *
+ *--------------------------------------------------------------------*/
+
+/*! Comment? */
+struct efrm_nic_table {
+ /*! nics attached to this driver */
+ struct efhw_nic *nic[EFHW_MAX_NR_DEVS];
+ /*! pointer to an arbitrary struct efhw_nic if one exists;
+ * for code which does not care which NIC it wants but
+ * still needs one. Note you cannot assume nic[0] exists. */
+ struct efhw_nic *a_nic;
+ uint32_t nic_count; /*!< number of nics attached to this driver */
+ spinlock_t lock; /*!< lock for table modifications */
+ atomic_t ref_count; /*!< refcount for users of nic table */
+};
+
+/* Resource driver structures used by other drivers as well */
+extern struct efrm_nic_table *efrm_nic_tablep;
+
+static inline void efrm_nic_table_hold(void)
+{
+ atomic_inc(&efrm_nic_tablep->ref_count);
+}
+
+static inline void efrm_nic_table_rele(void)
+{
+ atomic_dec(&efrm_nic_tablep->ref_count);
+}
+
+static inline int efrm_nic_table_held(void)
+{
+ return (atomic_read(&efrm_nic_tablep->ref_count) != 0);
+}
+
+/* Run code block _x multiple times with variable nic set to each
+ * registered NIC in turn.
+ * DO NOT "break" out of this loop early. */
+#define EFRM_FOR_EACH_NIC(_nic_i, _nic) \
+ for ((_nic_i) = (efrm_nic_table_hold(), 0); \
+ (_nic_i) < EFHW_MAX_NR_DEVS || (efrm_nic_table_rele(), 0); \
+ (_nic_i)++) \
+ if (((_nic) = efrm_nic_tablep->nic[_nic_i]))
+
+#define EFRM_FOR_EACH_NIC_IN_SET(_set, _i, _nic) \
+ for ((_i) = (efrm_nic_table_hold(), 0); \
+ (_i) < EFHW_MAX_NR_DEVS || (efrm_nic_table_rele(), 0); \
+ ++(_i)) \
+ if (((_nic) = efrm_nic_tablep->nic[_i]) && \
+ efrm_nic_set_read((_set), (_i)))
+
+#endif /* __CI_EFRM_NIC_TABLE_H__ */
--- /dev/null
+/****************************************************************************
+ * Driver for Solarflare network controllers -
+ * resource management for Xen backend, OpenOnload, etc
+ * (including support for SFE4001 10GBT NIC)
+ *
+ * This file provides Linux-like system-independent API for efrm library.
+ *
+ * Copyright 2005-2007: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Developed and maintained by Solarflare Communications:
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * Certain parts of the driver were implemented by
+ * Alexandra Kossovsky <Alexandra.Kossovsky@oktetlabs.ru>
+ * OKTET Labs Ltd, Russia,
+ * http://oktetlabs.ru, <info@oktetlabs.ru>
+ * by request of Solarflare Communications
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef __CI_EFRM_SYSDEP_H__
+#define __CI_EFRM_SYSDEP_H__
+
+/* Spinlocks are defined in efhw/sysdep.h */
+#include <ci/efhw/sysdep.h>
+
+#if defined(__linux__) && defined(__KERNEL__)
+
+# include <ci/efrm/sysdep_linux.h>
+
+#else
+
+# include <ci/efrm/sysdep_ci2linux.h>
+
+#endif
+
+#endif /* __CI_EFRM_SYSDEP_H__ */
--- /dev/null
+/****************************************************************************
+ * Driver for Solarflare network controllers -
+ * resource management for Xen backend, OpenOnload, etc
+ * (including support for SFE4001 10GBT NIC)
+ *
+ * This file provides version-independent Linux kernel API for efrm library.
+ * Only kernels >=2.6.9 are supported.
+ *
+ * Copyright 2005-2007: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Kfifo API is partially stolen from linux-2.6.22/include/linux/list.h
+ * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
+ *
+ * Developed and maintained by Solarflare Communications:
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * Certain parts of the driver were implemented by
+ * Alexandra Kossovsky <Alexandra.Kossovsky@oktetlabs.ru>
+ * OKTET Labs Ltd, Russia,
+ * http://oktetlabs.ru, <info@oktetlabs.ru>
+ * by request of Solarflare Communications
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef __CI_EFRM_SYSDEP_LINUX_H__
+#define __CI_EFRM_SYSDEP_LINUX_H__
+
+#include <linux/version.h>
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/workqueue.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hardirq.h>
+#include <linux/kernel.h>
+#include <linux/if_ether.h>
+#include <linux/completion.h>
+#include <linux/in.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+/* get roundup_pow_of_two(), which was in kernel.h in early kernel versions */
+#include <linux/log2.h>
+#endif
+
+/********************************************************************
+ *
+ * List API
+ *
+ ********************************************************************/
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18)
+static inline void
+list_replace_init(struct list_head *old, struct list_head *new)
+{
+ new->next = old->next;
+ new->next->prev = new;
+ new->prev = old->prev;
+ new->prev->next = new;
+ INIT_LIST_HEAD(old);
+}
+#endif
+
+static inline struct list_head *list_pop(struct list_head *list)
+{
+ struct list_head *link = list->next;
+ list_del(link);
+ return link;
+}
+
+static inline struct list_head *list_pop_tail(struct list_head *list)
+{
+ struct list_head *link = list->prev;
+ list_del(link);
+ return link;
+}
+
+/********************************************************************
+ *
+ * Workqueue API
+ *
+ ********************************************************************/
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
+#define NEED_OLD_WORK_API
+
+/**
+ * The old and new work function prototypes just change
+ * the type of the pointer in the only argument, so it's
+ * safe to cast one function type to the other
+ */
+typedef void (*efrm_old_work_func_t) (void *p);
+
+#undef INIT_WORK
+#define INIT_WORK(_work, _func) \
+ do { \
+ INIT_LIST_HEAD(&(_work)->entry); \
+ (_work)->pending = 0; \
+ PREPARE_WORK((_work), \
+ (efrm_old_work_func_t) (_func), \
+ (_work)); \
+ } while (0)
+
+#endif
+
+/********************************************************************
+ *
+ * Kfifo API
+ *
+ ********************************************************************/
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10)
+
+#if !defined(RHEL_RELEASE_CODE) || (RHEL_RELEASE_CODE < 1029)
+typedef unsigned gfp_t;
+#endif
+
+#define HAS_NO_KFIFO
+
+struct kfifo {
+ unsigned char *buffer; /* the buffer holding the data */
+ unsigned int size; /* the size of the allocated buffer */
+ unsigned int in; /* data is added at offset (in % size) */
+ unsigned int out; /* data is extracted from off. (out % size) */
+ spinlock_t *lock; /* protects concurrent modifications */
+};
+
+extern struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
+ gfp_t gfp_mask, spinlock_t *lock);
+extern struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask,
+ spinlock_t *lock);
+extern void kfifo_free(struct kfifo *fifo);
+extern unsigned int __kfifo_put(struct kfifo *fifo,
+ unsigned char *buffer, unsigned int len);
+extern unsigned int __kfifo_get(struct kfifo *fifo,
+ unsigned char *buffer, unsigned int len);
+
+/**
+ * kfifo_put - puts some data into the FIFO
+ * @fifo: the fifo to be used.
+ * @buffer: the data to be added.
+ * @len: the length of the data to be added.
+ *
+ * This function copies at most @len bytes from the @buffer into
+ * the FIFO depending on the free space, and returns the number of
+ * bytes copied.
+ */
+static inline unsigned int
+kfifo_put(struct kfifo *fifo, unsigned char *buffer, unsigned int len)
+{
+ unsigned long flags;
+ unsigned int ret;
+
+ spin_lock_irqsave(fifo->lock, flags);
+
+ ret = __kfifo_put(fifo, buffer, len);
+
+ spin_unlock_irqrestore(fifo->lock, flags);
+
+ return ret;
+}
+
+/**
+ * kfifo_get - gets some data from the FIFO
+ * @fifo: the fifo to be used.
+ * @buffer: where the data must be copied.
+ * @len: the size of the destination buffer.
+ *
+ * This function copies at most @len bytes from the FIFO into the
+ * @buffer and returns the number of copied bytes.
+ */
+static inline unsigned int
+kfifo_get(struct kfifo *fifo, unsigned char *buffer, unsigned int len)
+{
+ unsigned long flags;
+ unsigned int ret;
+
+ spin_lock_irqsave(fifo->lock, flags);
+
+ ret = __kfifo_get(fifo, buffer, len);
+
+ /*
+ * optimization: if the FIFO is empty, set the indices to 0
+ * so we don't wrap the next time
+ */
+ if (fifo->in == fifo->out)
+ fifo->in = fifo->out = 0;
+
+ spin_unlock_irqrestore(fifo->lock, flags);
+
+ return ret;
+}
+
+/**
+ * __kfifo_len - returns the number of bytes available in the FIFO, no locking version
+ * @fifo: the fifo to be used.
+ */
+static inline unsigned int __kfifo_len(struct kfifo *fifo)
+{
+ return fifo->in - fifo->out;
+}
+
+/**
+ * kfifo_len - returns the number of bytes available in the FIFO
+ * @fifo: the fifo to be used.
+ */
+static inline unsigned int kfifo_len(struct kfifo *fifo)
+{
+ unsigned long flags;
+ unsigned int ret;
+
+ spin_lock_irqsave(fifo->lock, flags);
+
+ ret = __kfifo_len(fifo);
+
+ spin_unlock_irqrestore(fifo->lock, flags);
+
+ return ret;
+}
+
+#else
+#include <linux/kfifo.h>
+#endif
+
+static inline void kfifo_vfree(struct kfifo *fifo)
+{
+ vfree(fifo->buffer);
+ kfree(fifo);
+}
+
+#endif /* __CI_EFRM_SYSDEP_LINUX_H__ */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_tools */
+
+#ifndef __CI_TOOLS_CONFIG_H__
+#define __CI_TOOLS_CONFIG_H__
+
+
+/**********************************************************************
+ * Debugging.
+ */
+
+#define CI_INCLUDE_ASSERT_VALID 0
+
+/* Set non-zero to allow info about who has allocated what to appear in
+ * /proc/drivers/level5/mem.
+ * However - Note that doing so can lead to segfault when you unload the
+ * driver, and other weirdness. i.e. I don't think the code for is quite
+ * right (written by Oktet, hacked by gel), but it does work well enough to be
+ * useful.
+ */
+#define CI_MEMLEAK_DEBUG_ALLOC_TABLE 0
+
+
+#endif /* __CI_TOOLS_CONFIG_H__ */
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_tools */
+
+#ifndef __CI_TOOLS_DEBUG_H__
+#define __CI_TOOLS_DEBUG_H__
+
+#define CI_LOG_E(x) x /* errors */
+#define CI_LOG_W(x) x /* warnings */
+#define CI_LOG_I(x) x /* information */
+#define CI_LOG_V(x) x /* verbose */
+
+/* Build time asserts. We paste the line number into the type name
+ * so that the macro can be used more than once per file even if the
+ * compiler objects to multiple identical typedefs. Collisions
+ * between use in different header files is still possible. */
+#ifndef CI_BUILD_ASSERT
+#define __CI_BUILD_ASSERT_NAME(_x) __CI_BUILD_ASSERT_ILOATHECPP(_x)
+#define __CI_BUILD_ASSERT_ILOATHECPP(_x) __CI_BUILD_ASSERT__ ##_x
+#define CI_BUILD_ASSERT(e)\
+ typedef char __CI_BUILD_ASSERT_NAME(__LINE__)[(e)?1:-1]
+#endif
+
+
+#ifdef NDEBUG
+
+# define _ci_check(exp, file, line)
+# define _ci_assert2(e, x, y, file, line)
+# define _ci_assert(exp, file, line)
+# define _ci_assert_equal(exp1, exp2, file, line)
+# define _ci_assert_equiv(exp1, exp2, file, line)
+# define _ci_assert_nequal(exp1, exp2, file, line)
+# define _ci_assert_le(exp1, exp2, file, line)
+# define _ci_assert_lt(exp1, exp2, file, line)
+# define _ci_assert_ge(exp1, exp2, file, line)
+# define _ci_assert_gt(exp1, exp2, file, line)
+# define _ci_assert_impl(exp1, exp2, file, line)
+
+# define _ci_verify(exp, file, line) \
+ do { \
+ (void)(exp); \
+ } while (0)
+
+# define CI_DEBUG_TRY(exp) \
+ do { \
+ (void)(exp); \
+ } while (0)
+
+#define CI_TRACE(exp,fmt)
+#define CI_TRACE_INT(integer)
+#define CI_TRACE_INT32(integer)
+#define CI_TRACE_INT64(integer)
+#define CI_TRACE_UINT(integer)
+#define CI_TRACE_UINT32(integer)
+#define CI_TRACE_UINT64(integer)
+#define CI_TRACE_HEX(integer)
+#define CI_TRACE_HEX32(integer)
+#define CI_TRACE_HEX64(integer)
+#define CI_TRACE_PTR(pointer)
+#define CI_TRACE_STRING(string)
+#define CI_TRACE_MAC(mac)
+#define CI_TRACE_IP(ip_be32)
+#define CI_TRACE_ARP(arp_pkt)
+
+#else
+
+# define _CI_ASSERT_FMT "\nfrom %s:%d"
+
+# define _ci_check(exp, file, line) \
+ do { \
+ if (CI_UNLIKELY(!(exp))) \
+ ci_warn(("ci_check(%s)"_CI_ASSERT_FMT, #exp, \
+ (file), (line))); \
+ } while (0)
+
+/*
+ * NOTE: ci_fail() emits the file and line where the assert is actually
+ * coded.
+ */
+
+# define _ci_assert(exp, file, line) \
+ do { \
+ if (CI_UNLIKELY(!(exp))) \
+ ci_fail(("ci_assert(%s)"_CI_ASSERT_FMT, #exp, \
+ (file), (line))); \
+ } while (0)
+
+# define _ci_assert2(e, x, y, file, line) do { \
+ if(CI_UNLIKELY( ! (e) )) \
+ ci_fail(("ci_assert(%s)\nwhere [%s=%"CI_PRIx64"] " \
+ "[%s=%"CI_PRIx64"]\nat %s:%d\nfrom %s:%d", #e \
+ , #x, (ci_uint64)(ci_uintptr_t)(x) \
+ , #y, (ci_uint64)(ci_uintptr_t)(y), \
+ __FILE__, __LINE__, (file), (line))); \
+ } while (0)
+
+# define _ci_verify(exp, file, line) \
+ do { \
+ if (CI_UNLIKELY(!(exp))) \
+ ci_fail(("ci_verify(%s)"_CI_ASSERT_FMT, #exp, \
+ (file), (line))); \
+ } while (0)
+
+# define _ci_assert_equal(x, y, f, l) _ci_assert2((x)==(y), x, y, (f), (l))
+# define _ci_assert_nequal(x, y, f, l) _ci_assert2((x)!=(y), x, y, (f), (l))
+# define _ci_assert_le(x, y, f, l) _ci_assert2((x)<=(y), x, y, (f), (l))
+# define _ci_assert_lt(x, y, f, l) _ci_assert2((x)< (y), x, y, (f), (l))
+# define _ci_assert_ge(x, y, f, l) _ci_assert2((x)>=(y), x, y, (f), (l))
+# define _ci_assert_gt(x, y, f, l) _ci_assert2((x)> (y), x, y, (f), (l))
+# define _ci_assert_or(x, y, f, l) _ci_assert2((x)||(y), x, y, (f), (l))
+# define _ci_assert_impl(x, y, f, l) _ci_assert2(!(x) || (y), x, y, (f), (l))
+# define _ci_assert_equiv(x, y, f, l) _ci_assert2(!(x)== !(y), x, y, (f), (l))
+
+#define _ci_assert_equal_msg(exp1, exp2, msg, file, line) \
+ do { \
+ if (CI_UNLIKELY((exp1)!=(exp2))) \
+ ci_fail(("ci_assert_equal_msg(%s == %s) were " \
+ "(%"CI_PRIx64":%"CI_PRIx64") with msg[%c%c%c%c]" \
+ _CI_ASSERT_FMT, #exp1, #exp2, \
+ (ci_uint64)(ci_uintptr_t)(exp1), \
+ (ci_uint64)(ci_uintptr_t)(exp2), \
+ (((ci_uint32)msg) >> 24) && 0xff, \
+ (((ci_uint32)msg) >> 16) && 0xff, \
+ (((ci_uint32)msg) >> 8 ) && 0xff, \
+ (((ci_uint32)msg) ) && 0xff, \
+ (file), (line))); \
+ } while (0)
+
+# define CI_DEBUG_TRY(exp) CI_TRY(exp)
+
+#define CI_TRACE(exp,fmt) \
+ ci_log("%s:%d:%s] " #exp "=" fmt, \
+ __FILE__, __LINE__, __FUNCTION__, (exp))
+
+
+#define CI_TRACE_INT(integer) \
+ ci_log("%s:%d:%s] " #integer "=%d", \
+ __FILE__, __LINE__, __FUNCTION__, (integer))
+
+
+#define CI_TRACE_INT32(integer) \
+ ci_log("%s:%d:%s] " #integer "=%d", \
+ __FILE__, __LINE__, __FUNCTION__, ((ci_int32)integer))
+
+
+#define CI_TRACE_INT64(integer) \
+ ci_log("%s:%d:%s] " #integer "=%lld", \
+ __FILE__, __LINE__, __FUNCTION__, ((ci_int64)integer))
+
+
+#define CI_TRACE_UINT(integer) \
+ ci_log("%s:%d:%s] " #integer "=%ud", \
+ __FILE__, __LINE__, __FUNCTION__, (integer))
+
+
+#define CI_TRACE_UINT32(integer) \
+ ci_log("%s:%d:%s] " #integer "=%ud", \
+ __FILE__, __LINE__, __FUNCTION__, ((ci_uint32)integer))
+
+
+#define CI_TRACE_UINT64(integer) \
+ ci_log("%s:%d:%s] " #integer "=%ulld", \
+ __FILE__, __LINE__, __FUNCTION__, ((ci_uint64)integer))
+
+
+#define CI_TRACE_HEX(integer) \
+ ci_log("%s:%d:%s] " #integer "=0x%x", \
+ __FILE__, __LINE__, __FUNCTION__, (integer))
+
+
+#define CI_TRACE_HEX32(integer) \
+ ci_log("%s:%d:%s] " #integer "=0x%x", \
+ __FILE__, __LINE__, __FUNCTION__, ((ci_uint32)integer))
+
+
+#define CI_TRACE_HEX64(integer) \
+ ci_log("%s:%d:%s] " #integer "=0x%llx", \
+ __FILE__, __LINE__, __FUNCTION__, ((ci_uint64)integer))
+
+
+#define CI_TRACE_PTR(pointer) \
+ ci_log("%s:%d:%s] " #pointer "=0x%p", \
+ __FILE__, __LINE__, __FUNCTION__, (pointer))
+
+
+#define CI_TRACE_STRING(string) \
+ ci_log("%s:%d:%s] " #string "=%s", \
+ __FILE__, __LINE__, __FUNCTION__, (string))
+
+
+#define CI_TRACE_MAC(mac) \
+ ci_log("%s:%d:%s] " #mac "=" CI_MAC_PRINTF_FORMAT, \
+ __FILE__, __LINE__, __FUNCTION__, CI_MAC_PRINTF_ARGS(mac))
+
+
+#define CI_TRACE_IP(ip_be32) \
+ ci_log("%s:%d:%s] " #ip_be32 "=" CI_IP_PRINTF_FORMAT, __FILE__, \
+ __LINE__, __FUNCTION__, CI_IP_PRINTF_ARGS(&(ip_be32)))
+
+
+#define CI_TRACE_ARP(arp_pkt) \
+ ci_log("%s:%d:%s]\n"CI_ARP_PRINTF_FORMAT, \
+ __FILE__, __LINE__, __FUNCTION__, CI_ARP_PRINTF_ARGS(arp_pkt))
+
+#endif /* NDEBUG */
+
+#define ci_check(exp) \
+ _ci_check(exp, __FILE__, __LINE__)
+
+#define ci_assert(exp) \
+ _ci_assert(exp, __FILE__, __LINE__)
+
+#define ci_verify(exp) \
+ _ci_verify(exp, __FILE__, __LINE__)
+
+#define ci_assert_equal(exp1, exp2) \
+ _ci_assert_equal(exp1, exp2, __FILE__, __LINE__)
+
+#define ci_assert_equal_msg(exp1, exp2, msg) \
+ _ci_assert_equal_msg(exp1, exp2, msg, __FILE__, __LINE__)
+
+#define ci_assert_nequal(exp1, exp2) \
+ _ci_assert_nequal(exp1, exp2, __FILE__, __LINE__)
+
+#define ci_assert_le(exp1, exp2) \
+ _ci_assert_le(exp1, exp2, __FILE__, __LINE__)
+
+#define ci_assert_lt(exp1, exp2) \
+ _ci_assert_lt(exp1, exp2, __FILE__, __LINE__)
+
+#define ci_assert_ge(exp1, exp2) \
+ _ci_assert_ge(exp1, exp2, __FILE__, __LINE__)
+
+#define ci_assert_gt(exp1, exp2) \
+ _ci_assert_gt(exp1, exp2, __FILE__, __LINE__)
+
+#define ci_assert_impl(exp1, exp2) \
+ _ci_assert_impl(exp1, exp2, __FILE__, __LINE__)
+
+#define ci_assert_equiv(exp1, exp2) \
+ _ci_assert_equiv(exp1, exp2, __FILE__, __LINE__)
+
+
+#define CI_TEST(exp) \
+ do{ \
+ if( CI_UNLIKELY(!(exp)) ) \
+ ci_fail(("CI_TEST(%s)", #exp)); \
+ }while(0)
+
+
+#define CI_TRY(exp) \
+ do{ \
+ int _trc; \
+ _trc=(exp); \
+ if( CI_UNLIKELY(_trc < 0) ) \
+ ci_sys_fail(#exp, _trc); \
+ }while(0)
+
+
+#define CI_TRY_RET(exp) \
+ do{ \
+ int _trc; \
+ _trc=(exp); \
+ if( CI_UNLIKELY(_trc < 0) ) { \
+ ci_log("%s returned %d at %s:%d", #exp, _trc, __FILE__, __LINE__); \
+ return _trc; \
+ } \
+ }while(0)
+
+#define CI_LOGLEVEL_TRY_RET(logfn, exp) \
+ do{ \
+ int _trc; \
+ _trc=(exp); \
+ if( CI_UNLIKELY(_trc < 0) ) { \
+ logfn (ci_log("%s returned %d at %s:%d", #exp, _trc, __FILE__, __LINE__)); \
+ return _trc; \
+ } \
+ }while(0)
+
+
+#define CI_SOCK_TRY(exp) \
+ do{ \
+ ci_sock_err_t _trc; \
+ _trc=(exp); \
+ if( CI_UNLIKELY(!ci_sock_errok(_trc)) ) \
+ ci_sys_fail(#exp, _trc.val); \
+ }while(0)
+
+
+#define CI_SOCK_TRY_RET(exp) \
+ do{ \
+ ci_sock_err_t _trc; \
+ _trc=(exp); \
+ if( CI_UNLIKELY(!ci_sock_errok(_trc)) ) { \
+ ci_log("%s returned %d at %s:%d", #exp, _trc.val, __FILE__, __LINE__); \
+ return ci_sock_errcode(_trc); \
+ } \
+ }while(0)
+
+
+#define CI_SOCK_TRY_SOCK_RET(exp) \
+ do{ \
+ ci_sock_err_t _trc; \
+ _trc=(exp); \
+ if( CI_UNLIKELY(!ci_sock_errok(_trc)) ) { \
+ ci_log("%s returned %d at %s:%d", #exp, _trc.val, __FILE__, __LINE__); \
+ return _trc; \
+ } \
+ }while(0)
+
+#endif /* __CI_TOOLS_DEBUG_H__ */
+
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * \author djr
+ * \brief Functions for logging and pretty-printing.
+ * \date 2002/08/07
+ */
+
+/*! \cidoxg_include_ci_tools */
+
+#ifndef __CI_TOOLS_LOG_H__
+#define __CI_TOOLS_LOG_H__
+
+#include <stdarg.h>
+
+
+/**********************************************************************
+ * Logging.
+ */
+
+/* size of internal log buffer */
+#define CI_LOG_MAX_LINE 512
+/* uses of ci_log must ensure that all trace messages are shorter than this */
+#define CI_LOG_MAX_MSG_LENGTH (CI_LOG_MAX_LINE-50)
+
+extern void ci_vlog(const char* fmt, va_list args) CI_HF;
+extern void ci_log(const char* fmt, ...) CI_PRINTF_LIKE(1,2) CI_HF;
+
+ /*! Set the prefix for log messages.
+ **
+ ** Uses the storage pointed to by \em prefix. Therefore \em prefix must
+ ** be allocated on the heap, or statically.
+ */
+extern void ci_set_log_prefix(const char* prefix) CI_HF;
+
+typedef void (*ci_log_fn_t)(const char* msg);
+extern ci_log_fn_t ci_log_fn CI_HV;
+
+/* Log functions. */
+extern void ci_log_null(const char* msg) CI_HF;
+extern void ci_log_stderr(const char* msg) CI_HF;
+extern void ci_log_stdout(const char* msg) CI_HF;
+extern void ci_log_syslog(const char* msg) CI_HF;
+
+/*! Call the following to install special logging behaviours. */
+extern void ci_log_buffer_till_fail(void) CI_HF;
+extern void ci_log_buffer_till_exit(void) CI_HF;
+
+extern void __ci_log_unique(const char* msg) CI_HF;
+extern ci_log_fn_t __ci_log_unique_fn CI_HV;
+ci_inline void ci_log_uniquify(void) {
+ if( ci_log_fn != __ci_log_unique ) {
+ __ci_log_unique_fn = ci_log_fn;
+ ci_log_fn = __ci_log_unique;
+ }
+}
+
+extern void ci_log_file(const char* msg) CI_HF;
+extern int ci_log_file_fd CI_HV;
+
+extern void __ci_log_nth(const char* msg) CI_HF;
+extern ci_log_fn_t __ci_log_nth_fn CI_HV;
+extern int ci_log_nth_n CI_HV; /* default 100 */
+ci_inline void ci_log_nth(void) {
+ if( ci_log_fn != __ci_log_nth ) {
+ __ci_log_nth_fn = ci_log_fn;
+ ci_log_fn = __ci_log_nth;
+ }
+}
+
+extern int ci_log_level CI_HV;
+
+extern int ci_log_options CI_HV;
+#define CI_LOG_PID 0x1
+#define CI_LOG_TID 0x2
+#define CI_LOG_TIME 0x4
+#define CI_LOG_DELTA 0x8
+
+/**********************************************************************
+ * Used to define which mode we are in
+ */
+#if (defined(_WIN32) && !defined(__KERNEL__))
+typedef enum {
+ ci_log_md_NULL=0,
+ ci_log_md_ioctl,
+ ci_log_md_stderr,
+ ci_log_md_stdout,
+ ci_log_md_file,
+ ci_log_md_serial,
+ ci_log_md_syslog,
+ ci_log_md_pidfile
+} ci_log_mode_t;
+extern ci_log_mode_t ci_log_mode;
+#endif
+
+/**********************************************************************
+ * Pretty-printing.
+ */
+
+extern char ci_printable_char(char c) CI_HF;
+
+extern void (*ci_hex_dump_formatter)(char* buf, const ci_octet* s,
+ int i, int off, int len) CI_HV;
+extern void ci_hex_dump_format_octets(char*,const ci_octet*,int,int,int) CI_HF;
+extern void ci_hex_dump_format_dwords(char*,const ci_octet*,int,int,int) CI_HF;
+
+extern void ci_hex_dump_row(char* buf, volatile const void* s, int len,
+ ci_ptr_arith_t address) CI_HF;
+ /*!< A row contains up to 16 bytes. Row starts at [address & 15u], so
+ ** therefore [len + (address & 15u)] must be <= 16.
+ */
+
+extern void ci_hex_dump(ci_log_fn_t, volatile const void*,
+ int len, ci_ptr_arith_t address) CI_HF;
+
+extern int ci_hex_dump_to_raw(const char* src_hex, void* buf,
+ unsigned* addr_out_opt, int* skip) CI_HF;
+ /*!< Recovers raw data from a single line of a hex dump. [buf] must be at
+ ** least 16 bytes long. Returns the number of bytes written to [buf] (in
+ ** range 1 -> 16), or -1 if [src_hex] doesn't contain hex data. Does not
+ ** cope with missing bytes at the start of a line.
+ */
+
+extern int ci_format_eth_addr(char* buf, const void* eth_mac_addr,
+ char sep) CI_HF;
+ /*!< This will write 18 characters to <buf> including terminating null.
+ ** Returns number of bytes written excluding null. If [sep] is zero, ':'
+ ** is used.
+ */
+
+extern int ci_parse_eth_addr(void* eth_mac_addr,
+ const char* str, char sep) CI_HF;
+ /*!< If [sep] is zero, absolutely any separator is accepted (even
+ ** inconsistent separators). Returns 0 on success, -1 on error.
+ */
+
+extern int ci_format_ip4_addr(char* buf, unsigned addr_be32) CI_HF;
+ /*!< Formats the IP address (in network endian) in dotted-quad. Returns
+ ** the number of bytes written (up to 15), excluding the null. [buf]
+ ** must be at least 16 bytes long.
+ */
+
+
+/**********************************************************************
+ * Error checking.
+ */
+
+extern void (*ci_fail_stop_fn)(void) CI_HV;
+
+extern void ci_fail_stop(void) CI_HF;
+extern void ci_fail_hang(void) CI_HF;
+extern void ci_fail_bomb(void) CI_HF;
+extern void ci_backtrace(void) CI_HF;
+
+#if defined __linux__ && !defined __KERNEL__
+extern void ci_fail_abort (void) CI_HF;
+#endif
+
+#ifdef __GNUC__
+extern void
+__ci_fail(const char*, ...) CI_PRINTF_LIKE(1,2) CI_HF;
+#else
+# if _PREFAST_
+ extern void _declspec(noreturn) __ci_fail(const char* fmt, ...);
+# else
+ extern void __ci_fail(const char* fmt, ...);
+# endif
+
+#endif
+
+#define ci_warn(x) \
+ do{ ci_log("WARN at %s:%d", __FILE__, __LINE__); }while(0)
+
+#define ci_fail(x) \
+ do{ ci_log("FAIL at %s:%d", __FILE__, __LINE__); __ci_fail x; }while(0)
+
+extern void __ci_sys_fail(const char* fn, int rc,
+ const char* file, int line) CI_HF;
+#define ci_sys_fail(fn, rc) __ci_sys_fail(fn, rc, __FILE__, __LINE__)
+
+/**********************************************************************
+ * Logging to buffer (src/citools/log_buffer.c)
+ */
+
+/*! Divert ci_log() messages to the log buffer
+ * normally they go to the system console */
+extern void ci_log_buffer_till_fail(void) CI_HF;
+
+/*! Dump the contents of the log buffer to the system console */
+extern void ci_log_buffer_dump(void) CI_HF;
+
+
+/**********************************************************************
+ * Some useful pretty-printing.
+ */
+
+#ifdef __linux__
+# define CI_SOCKCALL_FLAGS_FMT "%s%s%s%s%s%s%s%s%s%s%s"
+
+# define CI_SOCKCALL_FLAGS_PRI_ARG(x) \
+ (((x) & MSG_OOB ) ? "OOB " :""), \
+ (((x) & MSG_PEEK ) ? "PEEK " :""), \
+ (((x) & MSG_DONTROUTE ) ? "DONTROUTE " :""), \
+ (((x) & MSG_EOR ) ? "EOR " :""), \
+ (((x) & MSG_CTRUNC ) ? "CTRUNC " :""), \
+ (((x) & MSG_TRUNC ) ? "TRUNC " :""), \
+ (((x) & MSG_WAITALL ) ? "WAITALL " :""), \
+ (((x) & MSG_DONTWAIT ) ? "DONTWAIT " :""), \
+ (((x) & MSG_NOSIGNAL ) ? "NOSIGNAL " :""), \
+ (((x) & MSG_ERRQUEUE ) ? "ERRQUEUE " :""), \
+ (((x) & MSG_CONFIRM ) ? "CONFIRM " :"")
+#endif
+
+#ifdef _WIN32
+# define CI_SOCKCALL_FLAGS_FMT "%s%s%s"
+
+# define CI_SOCKCALL_FLAGS_PRI_ARG(x) \
+ (((x) & MSG_OOB ) ? "OOB " :""), \
+ (((x) & MSG_PEEK ) ? "PEEK " :""), \
+ (((x) & MSG_DONTROUTE ) ? "DONTROUTE " :"")
+#endif
+
+#ifdef __sun__
+# define CI_SOCKCALL_FLAGS_FMT "%s%s%s%s%s%s%s%s%s"
+
+# define CI_SOCKCALL_FLAGS_PRI_ARG(x) \
+ (((x) & MSG_OOB ) ? "OOB " :""), \
+ (((x) & MSG_PEEK ) ? "PEEK " :""), \
+ (((x) & MSG_DONTROUTE ) ? "DONTROUTE " :""), \
+ (((x) & MSG_EOR ) ? "EOR " :""), \
+ (((x) & MSG_CTRUNC ) ? "CTRUNC " :""), \
+ (((x) & MSG_TRUNC ) ? "TRUNC " :""), \
+ (((x) & MSG_WAITALL ) ? "WAITALL " :""), \
+ (((x) & MSG_DONTWAIT ) ? "DONTWAIT " :""), \
+ (((x) & MSG_NOTIFICATION) ? "NOTIFICATION" :"")
+#endif
+
+#endif /* __CI_TOOLS_LOG_H__ */
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_tools_platform */
+
+#ifndef __CI_TOOLS_GCC_X86_H__
+#define __CI_TOOLS_GCC_X86_H__
+
+
+/**********************************************************************
+ * Free-running cycle counters.
+ */
+
+#define CI_HAVE_FRC64
+#define CI_HAVE_FRC32
+
+#define ci_frc32(pval) __asm__ __volatile__("rdtsc" : "=a" (*pval) : : "edx")
+
+#if defined(__x86_64__)
+ci_inline void ci_frc64(ci_uint64* pval) {
+ /* temp fix until we figure how to get this out in one bite */
+ ci_uint64 low, high;
+ __asm__ __volatile__("rdtsc" : "=a" (low) , "=d" (high));
+ *pval = (high << 32) | low;
+}
+
+#else
+#define ci_frc64(pval) __asm__ __volatile__("rdtsc" : "=A" (*pval))
+#endif
+
+#define ci_frc_flush() /* ?? Need a pipeline barrier. */
+
+
+/**********************************************************************
+ * Atomic integer.
+ */
+
+/*
+** int ci_atomic_read(a) { return a->n; }
+** void ci_atomic_set(a, v) { a->n = v; }
+** void ci_atomic_inc(a) { ++a->n; }
+** void ci_atomic_dec(a) { --a->n; }
+** int ci_atomic_inc_and_test(a) { return ++a->n == 0; }
+** int ci_atomic_dec_and_test(a) { return --a->n == 0; }
+** void ci_atomic_and(a, v) { a->n &= v; }
+** void ci_atomic_or(a, v) { a->n |= v; }
+*/
+
+typedef struct { volatile ci_int32 n; } ci_atomic_t;
+
+#define CI_ATOMIC_INITIALISER(i) {(i)}
+
+static inline ci_int32 ci_atomic_read(const ci_atomic_t* a) { return a->n; }
+static inline void ci_atomic_set(ci_atomic_t* a, int v) { a->n = v; ci_wmb(); }
+
+static inline void ci_atomic_inc(ci_atomic_t* a)
+{ __asm__ __volatile__("lock; incl %0" : "+m" (a->n)); }
+
+
+static inline void ci_atomic_dec(ci_atomic_t* a)
+{ __asm__ __volatile__("lock; decl %0" : "+m" (a->n)); }
+
+static inline int ci_atomic_inc_and_test(ci_atomic_t* a) {
+ char r;
+ __asm__ __volatile__("lock; incl %0; sete %1"
+ : "+m" (a->n), "=qm" (r));
+ return r;
+}
+
+static inline int ci_atomic_dec_and_test(ci_atomic_t* a) {
+ char r;
+ __asm__ __volatile__("lock; decl %0; sete %1"
+ : "+m" (a->n), "=qm" (r));
+ return r;
+}
+
+ci_inline int
+ci_atomic_xadd (ci_atomic_t *a, int v) {
+ __asm__ ("lock xadd %0, %1" : "=r" (v), "+m" (a->n) : "0" (v));
+ return v;
+}
+ci_inline int
+ci_atomic_xchg (ci_atomic_t *a, int v) {
+ __asm__ ("lock xchg %0, %1" : "=r" (v), "+m" (a->n) : "0" (v));
+ return v;
+}
+
+ci_inline void ci_atomic32_or(volatile ci_uint32* p, ci_uint32 mask)
+{ __asm__ __volatile__("lock; orl %1, %0" : "+m" (*p) : "ir" (mask)); }
+
+ci_inline void ci_atomic32_and(volatile ci_uint32* p, ci_uint32 mask)
+{ __asm__ __volatile__("lock; andl %1, %0" : "+m" (*p) : "ir" (mask)); }
+
+ci_inline void ci_atomic32_add(volatile ci_uint32* p, ci_uint32 v)
+{ __asm__ __volatile__("lock; addl %1, %0" : "+m" (*p) : "ir" (v)); }
+
+#define ci_atomic_or(a, v) ci_atomic32_or ((ci_uint32*) &(a)->n, (v))
+#define ci_atomic_and(a, v) ci_atomic32_and((ci_uint32*) &(a)->n, (v))
+#define ci_atomic_add(a, v) ci_atomic32_add((ci_uint32*) &(a)->n, (v))
+
+extern int ci_glibc_uses_nptl (void) CI_HF;
+extern int ci_glibc_nptl_broken(void) CI_HF;
+extern int ci_glibc_gs_get_is_multihreaded_offset (void) CI_HF;
+extern int ci_glibc_gs_is_multihreaded_offset CI_HV;
+
+#if !defined(__x86_64__)
+#ifdef __GLIBC__
+/* Returns non-zero if the calling process might be mulithreaded, returns 0 if
+ * it definitely isn't (i.e. if reimplementing this function for other
+ * architectures and platforms, you can safely just return 1).
+ */
+static inline int ci_is_multithreaded (void) {
+
+ while (1) {
+ if (ci_glibc_gs_is_multihreaded_offset >= 0) {
+ /* NPTL keeps a variable that tells us this hanging off gs (i.e. in thread-
+ * local storage); just return this
+ */
+ int r;
+ __asm__ __volatile__ ("movl %%gs:(%1), %0"
+ : "=r" (r)
+ : "r" (ci_glibc_gs_is_multihreaded_offset));
+ return r;
+ }
+
+ if (ci_glibc_gs_is_multihreaded_offset == -2) {
+ /* This means we've already determined that the libc version is NOT good
+ * for our funky "is multithreaded" hack
+ */
+ return 1;
+ }
+
+ /* If we get here, it means this is the first time the function has been
+ * called -- detect the libc version and go around again.
+ */
+ ci_glibc_gs_is_multihreaded_offset = ci_glibc_gs_get_is_multihreaded_offset ();
+
+ /* Go around again. We do the test here rather than at the top so that we go
+ * quicker in the common the case
+ */
+ }
+}
+
+#else /* def __GLIBC__ */
+
+#define ci_is_multithreaded() 1 /* ?? Is the the POSIX way of finding out */
+ /* whether the appication is single */
+ /* threaded? */
+
+#endif /* def __GLIBC__ */
+
+#else /* defined __x86_64__ */
+
+static inline int ci_is_multithreaded (void) {
+ /* Now easy way to tell on x86_64; so assume we're multithreaded */
+ return 1;
+}
+
+#endif /* defined __x86_64__ */
+
+
+/**********************************************************************
+ * Compare and swap.
+ */
+
+#define CI_HAVE_COMPARE_AND_SWAP
+
+ci_inline int ci_cas32_succeed(volatile ci_int32* p, ci_int32 oldval,
+ ci_int32 newval) {
+ char ret;
+ ci_int32 prevval;
+ __asm__ __volatile__("lock; cmpxchgl %3, %1; sete %0"
+ : "=q"(ret), "+m"(*p), "=a"(prevval)
+ : "r"(newval), "a"(oldval));
+ return ret;
+}
+
+ci_inline int ci_cas32_fail(volatile ci_int32* p, ci_int32 oldval,
+ ci_int32 newval) {
+ char ret;
+ ci_int32 prevval;
+ __asm__ __volatile__("lock; cmpxchgl %3, %1; setne %0"
+ : "=q"(ret), "+m"(*p), "=a"(prevval)
+ : "r"(newval), "a"(oldval));
+ return ret;
+}
+
+#ifdef __x86_64__
+ci_inline int ci_cas64_succeed(volatile ci_int64* p, ci_int64 oldval,
+ ci_int64 newval) {
+ char ret;
+ ci_int64 prevval;
+ __asm__ __volatile__("lock; cmpxchgq %3, %1; sete %0"
+ : "=q"(ret), "+m"(*p), "=a"(prevval)
+ : "r"(newval), "a"(oldval));
+ return ret;
+}
+
+ci_inline int ci_cas64_fail(volatile ci_int64* p, ci_int64 oldval,
+ ci_int64 newval) {
+ char ret;
+ ci_int64 prevval;
+ __asm__ __volatile__("lock; cmpxchgq %3, %1; setne %0"
+ : "=q"(ret), "+m"(*p), "=a"(prevval)
+ : "r"(newval), "a"(oldval));
+ return ret;
+}
+#endif
+
+ci_inline int ci_cas32u_succeed(volatile ci_uint32* p, ci_uint32 oldval, ci_uint32 newval) {
+ char ret;
+ ci_uint32 prevval;
+ __asm__ __volatile__("lock; cmpxchgl %3, %1; sete %0"
+ : "=q"(ret), "+m"(*p), "=a"(prevval)
+ : "r"(newval), "a"(oldval));
+ return ret;
+}
+
+ci_inline int ci_cas32u_fail(volatile ci_uint32* p, ci_uint32 oldval, ci_uint32 newval) {
+ char ret;
+ ci_uint32 prevval;
+ __asm__ __volatile__("lock; cmpxchgl %3, %1; setne %0"
+ : "=q"(ret), "+m"(*p), "=a"(prevval)
+ : "r"(newval), "a"(oldval));
+ return ret;
+}
+
+ci_inline int ci_cas64u_succeed(volatile ci_uint64* p, ci_uint64 oldval,
+ ci_uint64 newval) {
+ char ret;
+ ci_uint64 prevval;
+ __asm__ __volatile__("lock; cmpxchgq %3, %1; sete %0"
+ : "=q"(ret), "+m"(*p), "=a"(prevval)
+ : "r"(newval), "a"(oldval));
+ return ret;
+}
+
+ci_inline int ci_cas64u_fail(volatile ci_uint64* p, ci_uint64 oldval,
+ ci_uint64 newval) {
+ char ret;
+ ci_uint64 prevval;
+ __asm__ __volatile__("lock; cmpxchgq %3, %1; setne %0"
+ : "=q"(ret), "+m"(*p), "=a"(prevval)
+ : "r"(newval), "a"(oldval));
+ return ret;
+}
+
+#ifdef __x86_64__
+
+# define ci_cas_uintptr_succeed(p,o,n) \
+ ci_cas64u_succeed((volatile ci_uint64*) (p), (o), (n))
+# define ci_cas_uintptr_fail(p,o,n) \
+ ci_cas64u_fail((volatile ci_uint64*) (p), (o), (n))
+
+#else
+
+# define ci_cas_uintptr_succeed(p,o,n) \
+ ci_cas32u_succeed((volatile ci_uint32*) (p), (o), (n))
+# define ci_cas_uintptr_fail(p,o,n) \
+ ci_cas32u_fail((volatile ci_uint32*) (p), (o), (n))
+
+#endif
+
+
+/**********************************************************************
+ * Atomic bit field.
+ */
+
+typedef ci_uint32 ci_bits;
+#define CI_BITS_N 32u
+
+#define CI_BITS_DECLARE(name, n) \
+ ci_bits name[((n) + CI_BITS_N - 1u) / CI_BITS_N]
+
+ci_inline void ci_bits_clear_all(volatile ci_bits* b, int n_bits)
+{ memset((void*) b, 0, (n_bits+CI_BITS_N-1u) / CI_BITS_N * sizeof(ci_bits)); }
+
+ci_inline void ci_bit_set(volatile ci_bits* b, int i) {
+ __asm__ __volatile__("lock; btsl %1, %0"
+ : "=m" (*b)
+ : "Ir" (i));
+}
+
+ci_inline void ci_bit_clear(volatile ci_bits* b, int i) {
+ __asm__ __volatile__("lock; btrl %1, %0"
+ : "=m" (*b)
+ : "Ir" (i));
+}
+
+ci_inline int ci_bit_test(volatile ci_bits* b, int i) {
+ char rc;
+ __asm__("btl %2, %1; setc %0"
+ : "=r" (rc)
+ : "m" (*b), "Ir" (i));
+ return rc;
+}
+
+ci_inline int ci_bit_test_and_set(volatile ci_bits* b, int i) {
+ char rc;
+ __asm__ __volatile__("lock; btsl %2, %1; setc %0"
+ : "=r" (rc), "+m" (*b)
+ : "Ir" (i));
+ return rc;
+}
+
+ci_inline int ci_bit_test_and_clear(volatile ci_bits* b, int i) {
+ char rc;
+ __asm__ __volatile__("lock; btrl %2, %1; setc %0"
+ : "=r" (rc), "+m" (*b)
+ : "Ir" (i));
+ return rc;
+}
+
+/* These mask ops only work within a single ci_bits word. */
+#define ci_bit_mask_set(b,m) ci_atomic32_or((b), (m))
+#define ci_bit_mask_clear(b,m) ci_atomic32_and((b), ~(m))
+
+
+/**********************************************************************
+ * Misc.
+ */
+
+#if __GNUC__ >= 3
+# define ci_spinloop_pause() __asm__("pause")
+#else
+# define ci_spinloop_pause() __asm__(".byte 0xf3, 0x90")
+#endif
+
+
+#define CI_HAVE_ADDC32
+#define ci_add_carry32(sum, v) __asm__("addl %1, %0 ;" \
+ "adcl $0, %0 ;" \
+ : "=r" (sum) \
+ : "g" ((ci_uint32) v), "0" (sum))
+
+
+#endif /* __CI_TOOLS_GCC_X86_H__ */
+
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+
+/*! \cidoxg_include_ci_tools_platform */
+
+#ifndef __CI_TOOLS_LINUX_KERNEL_H__
+#define __CI_TOOLS_LINUX_KERNEL_H__
+
+/**********************************************************************
+ * Need to know the kernel version.
+ */
+
+#ifndef LINUX_VERSION_CODE
+# include <linux/version.h>
+# ifndef UTS_RELEASE
+ /* 2.6.18 onwards defines UTS_RELEASE in a separate header */
+# include <linux/utsrelease.h>
+# endif
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) || \
+ LINUX_VERSION_CODE >= KERNEL_VERSION(2,7,0)
+# error "Linux 2.6 required"
+#endif
+
+
+#include <linux/slab.h> /* kmalloc / kfree */
+#include <linux/vmalloc.h> /* vmalloc / vfree */
+#include <linux/interrupt.h>/* in_interrupt() */
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/spinlock.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+#include <linux/ctype.h>
+#include <linux/uio.h>
+#include <asm/current.h>
+#include <asm/errno.h>
+#include <asm/kmap_types.h>
+#include <asm/semaphore.h>
+
+#include <ci/tools/config.h>
+
+#define ci_in_irq in_irq
+#define ci_in_interrupt in_interrupt
+#define ci_in_atomic in_atomic
+
+
+/**********************************************************************
+ * Misc stuff.
+ */
+
+#ifdef BUG
+# define CI_BOMB BUG
+#endif
+
+ci_inline void* __ci_alloc(size_t n)
+{ return kmalloc(n, (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)); }
+
+ci_inline void* __ci_atomic_alloc(size_t n)
+{ return kmalloc(n, GFP_ATOMIC ); }
+
+ci_inline void __ci_free(void* p) { return kfree(p); }
+ci_inline void* __ci_vmalloc(size_t n) { return vmalloc(n); }
+ci_inline void __ci_vfree(void* p) { return vfree(p); }
+
+
+#if CI_MEMLEAK_DEBUG_ALLOC_TABLE
+ #define ci_alloc(s) ci_alloc_memleak_debug (s, __FILE__, __LINE__)
+ #define ci_atomic_alloc(s) ci_atomic_alloc_memleak_debug(s, __FILE__, __LINE__)
+ #define ci_free ci_free_memleak_debug
+ #define ci_vmalloc(s) ci_vmalloc_memleak_debug (s, __FILE__,__LINE__)
+ #define ci_vfree ci_vfree_memleak_debug
+ #define ci_alloc_fn ci_alloc_fn_memleak_debug
+ #define ci_vmalloc_fn ci_vmalloc_fn_memleak_debug
+#else /* !CI_MEMLEAK_DEBUG_ALLOC_TABLE */
+ #define ci_alloc_fn __ci_alloc
+ #define ci_vmalloc_fn __ci_vmalloc
+#endif
+
+#ifndef ci_alloc
+ #define ci_atomic_alloc __ci_atomic_alloc
+ #define ci_alloc __ci_alloc
+ #define ci_free __ci_free
+ #define ci_vmalloc __ci_vmalloc
+ #define ci_vmalloc_fn __ci_vmalloc
+ #define ci_vfree __ci_vfree
+#endif
+
+#define ci_sprintf sprintf
+#define ci_vsprintf vsprintf
+#define ci_snprintf snprintf
+#define ci_vsnprintf vsnprintf
+#define ci_sscanf sscanf
+
+
+#define CI_LOG_FN_DEFAULT ci_log_syslog
+
+
+/*--------------------------------------------------------------------
+ *
+ * irqs_disabled - needed for kmap helpers on some kernels
+ *
+ *--------------------------------------------------------------------*/
+#ifdef irqs_disabled
+# define ci_irqs_disabled irqs_disabled
+#else
+# if defined(__i386__) | defined(__x86_64__)
+# define ci_irqs_disabled(x) \
+ ({ \
+ unsigned long flags; \
+ local_save_flags(flags); \
+ !(flags & (1<<9)); \
+ })
+# else
+# error "Need to implement irqs_disabled() for your architecture"
+# endif
+#endif
+
+
+/**********************************************************************
+ * kmap helpers.
+ *
+ * Use ci_k(un)map for code paths which are not in an atomic context.
+ * For atomic code you need to use ci_k(un)map_in_atomic. This will grab
+ * one of the per-CPU kmap slots.
+ *
+ * NB in_interrupt != in_irq. If you don't know the difference then
+ * don't use kmap_in_atomic
+ *
+ * 2.4 allocates kmap slots by function. We are going to re-use the
+ * skb module's slot - we also use the same interlock
+ *
+ * 2.6 allocates kmap slots by type as well as by function. We are
+ * going to use the currently (2.6.10) unsused SOFTIRQ slot
+ *
+ */
+
+ci_inline void* ci_kmap(struct page *page) {
+ CI_DEBUG(if( ci_in_atomic() | ci_in_interrupt() | ci_in_irq() ) BUG());
+ return kmap(page);
+}
+
+ci_inline void ci_kunmap(struct page *page) {
+ kunmap(page);
+}
+
+#define CI_KM_SLOT KM_SOFTIRQ0
+
+
+typedef struct semaphore ci_semaphore_t;
+
+ci_inline void
+ci_sem_init (ci_semaphore_t *sem, int val) {
+ sema_init (sem, val);
+}
+
+ci_inline void
+ci_sem_down (ci_semaphore_t *sem) {
+ down (sem);
+}
+
+ci_inline int
+ci_sem_trydown (ci_semaphore_t *sem) {
+ return down_trylock (sem);
+}
+
+ci_inline void
+ci_sem_up (ci_semaphore_t *sem) {
+ up (sem);
+}
+
+ci_inline int
+ci_sem_get_count(ci_semaphore_t *sem) {
+ return sem->count.counter;
+}
+
+ci_inline void* ci_kmap_in_atomic(struct page *page)
+{
+ CI_DEBUG(if( ci_in_irq() ) BUG());
+
+ /* iSCSI can call without in_interrupt() but with irqs_disabled()
+ and in a context that can't sleep, so we need to check that
+ too */
+ if(ci_in_interrupt() || ci_irqs_disabled())
+ return kmap_atomic(page, CI_KM_SLOT);
+ else
+ return kmap(page);
+}
+
+ci_inline void ci_kunmap_in_atomic(struct page *page, void* kaddr)
+{
+ CI_DEBUG(if( ci_in_irq() ) BUG());
+
+ /* iSCSI can call without in_interrupt() but with irqs_disabled()
+ and in a context that can't sleep, so we need to check that
+ too */
+ if(ci_in_interrupt() || ci_irqs_disabled())
+ kunmap_atomic(kaddr, CI_KM_SLOT);
+ else
+ kunmap(page);
+}
+
+/**********************************************************************
+ * spinlock implementation: used by <ci/tools/spinlock.h>
+ */
+
+#define CI_HAVE_SPINLOCKS
+
+typedef ci_uintptr_t ci_lock_holder_t;
+#define ci_lock_thisthread (ci_lock_holder_t)current
+#define ci_lock_no_holder (ci_lock_holder_t)NULL
+
+typedef spinlock_t ci_lock_i;
+typedef spinlock_t ci_irqlock_i;
+typedef unsigned long ci_irqlock_state_t;
+
+#define IRQLOCK_CYCLES 500000
+
+#define ci_lock_ctor_i(l) spin_lock_init(l)
+#define ci_lock_dtor_i(l) do{}while(0)
+#define ci_lock_lock_i(l) spin_lock(l)
+#define ci_lock_trylock_i(l) spin_trylock(l)
+#define ci_lock_unlock_i(l) spin_unlock(l)
+
+#define ci_irqlock_ctor_i(l) spin_lock_init(l)
+#define ci_irqlock_dtor_i(l) do{}while(0)
+#define ci_irqlock_lock_i(l,s) spin_lock_irqsave(l,*(s))
+#define ci_irqlock_unlock_i(l,s) spin_unlock_irqrestore(l, *(s))
+
+
+/**********************************************************************
+ * register access
+ */
+
+#include <asm/io.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
+typedef volatile void __iomem* ioaddr_t;
+#else
+typedef unsigned long ioaddr_t;
+#endif
+
+
+
+/**********************************************************************
+ * thread implementation -- kernel dependancies probably should be
+ * moved to driver/linux_kernel.h
+ */
+
+#define ci_linux_daemonize(name) daemonize(name)
+
+#include <linux/workqueue.h>
+
+
+typedef struct {
+ void* (*fn)(void* arg);
+ void* arg;
+ const char* name;
+ int thrd_id;
+ struct completion exit_event;
+ struct work_struct keventd_witem;
+} ci_kernel_thread_t;
+
+
+typedef ci_kernel_thread_t* cithread_t;
+
+
+extern int cithread_create(cithread_t* tid, void* (*fn)(void*), void* arg,
+ const char* name);
+extern int cithread_detach(cithread_t kt);
+extern int cithread_join(cithread_t kt);
+
+
+/* Kernel sysctl variables. */
+extern int sysctl_tcp_wmem[3];
+extern int sysctl_tcp_rmem[3];
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+#define LINUX_HAS_SYSCTL_MEM_MAX
+extern ci_uint32 sysctl_wmem_max;
+extern ci_uint32 sysctl_rmem_max;
+#endif
+
+
+/*--------------------------------------------------------------------
+ *
+ * ci_bigbuf_t: An abstraction of a large buffer. Needed because in the
+ * Linux kernel, large buffers need to be allocated with vmalloc(), whereas
+ * smaller buffers should use kmalloc(). This abstraction chooses the
+ * appropriate mechansim.
+ *
+ *--------------------------------------------------------------------*/
+
+typedef struct {
+ char* p;
+ int is_vmalloc;
+} ci_bigbuf_t;
+
+
+ci_inline int ci_bigbuf_alloc(ci_bigbuf_t* bb, size_t bytes) {
+ if( bytes >= CI_PAGE_SIZE && ! ci_in_atomic() ) {
+ bb->is_vmalloc = 1;
+ if( (bb->p = vmalloc(bytes)) ) return 0;
+ }
+ bb->is_vmalloc = 0;
+ bb->p = kmalloc(bytes, ci_in_interrupt() ? GFP_ATOMIC : GFP_KERNEL);
+ return bb->p ? 0 : -ENOMEM;
+}
+
+ci_inline void ci_bigbuf_free(ci_bigbuf_t* bb) {
+ if( bb->is_vmalloc ) vfree(bb->p);
+ else kfree(bb->p);
+}
+
+ci_inline char* ci_bigbuf_ptr(ci_bigbuf_t* bb)
+{ return bb->p; }
+
+/**********************************************************************
+ * struct iovec abstraction (for Windows port)
+ */
+
+typedef struct iovec ci_iovec;
+
+/* Accessors for buffer/length */
+#define CI_IOVEC_BASE(i) ((i)->iov_base)
+#define CI_IOVEC_LEN(i) ((i)->iov_len)
+
+/**********************************************************************
+ * Signals
+ */
+
+ci_inline void
+ci_send_sig(int signum)
+{
+ send_sig(signum, current, 0);
+}
+
+#endif /* __CI_TOOLS_LINUX_KERNEL_H__ */
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_tools */
+
+#ifndef __CI_TOOLS_SYSDEP_H__
+#define __CI_TOOLS_SYSDEP_H__
+
+/* Make this header self-sufficient */
+#include <ci/compat.h>
+#include <ci/tools/log.h>
+#include <ci/tools/debug.h>
+
+
+/**********************************************************************
+ * Platform dependencies.
+ */
+
+#if defined(__KERNEL__)
+
+# if defined(__linux__)
+# include <ci/tools/platform/linux_kernel.h>
+# elif defined(_WIN32)
+# include <ci/tools/platform/win32_kernel.h>
+# elif defined(__sun__)
+# include <ci/tools/platform/sunos_kernel.h>
+# else
+# error Unknown platform.
+# endif
+
+#elif defined(_WIN32)
+
+# include <ci/tools/platform/win32.h>
+
+#elif defined(__unix__)
+
+# include <ci/tools/platform/unix.h>
+
+#else
+
+# error Unknown platform.
+
+#endif
+
+#if defined(__linux__)
+/*! Linux sendfile() support enable/disable. */
+# define CI_HAVE_SENDFILE /* provide sendfile i/f */
+
+# define CI_HAVE_OS_NOPAGE
+#endif
+
+#if defined(__sun__)
+# define CI_HAVE_SENDFILE /* provide sendfile i/f */
+# define CI_HAVE_SENDFILEV /* provide sendfilev i/f */
+
+# define CI_IOCTL_SENDFILE /* use efrm CI_SENDFILEV ioctl */
+#endif
+
+#if defined(_WIN32)
+typedef ci_uint32 ci_uerr_t; /* range of OS user-mode return codes */
+typedef ci_uint32 ci_kerr_t; /* range of OS kernel-mode return codes */
+#elif defined(__unix__)
+typedef ci_int32 ci_uerr_t; /* range of OS user-mode return codes */
+typedef ci_int32 ci_kerr_t; /* range of OS kernel-mode return codes */
+#endif
+
+
+/**********************************************************************
+ * Compiler and processor dependencies.
+ */
+
+#if defined(__GNUC__)
+
+#if defined(__i386__) || defined(__x86_64__)
+# include <ci/tools/platform/gcc_x86.h>
+#elif defined(__PPC__)
+# include <ci/tools/platform/gcc_ppc.h>
+#elif defined(__ia64__)
+# include <ci/tools/platform/gcc_ia64.h>
+#else
+# error Unknown processor.
+#endif
+
+#elif defined(_MSC_VER)
+
+#if defined(__i386__)
+# include <ci/tools/platform/msvc_x86.h>
+# elif defined(__x86_64__)
+# include <ci/tools/platform/msvc_x86_64.h>
+#else
+# error Unknown processor.
+#endif
+
+#elif defined(__PGI)
+
+# include <ci/tools/platform/pg_x86.h>
+
+#elif defined(__INTEL_COMPILER)
+
+/* Intel compilers v7 claim to be very gcc compatible. */
+# include <ci/tools/platform/gcc_x86.h>
+
+#else
+# error Unknown compiler.
+#endif
+
+
+#endif /* __CI_TOOLS_SYSDEP_H__ */
+
+/*! \cidoxg_end */
--- /dev/null
+EXTRA_CFLAGS += -Idrivers/xen/sfc_netfront -Idrivers/xen/sfc_netutil -Idrivers/xen/netfront
+EXTRA_CFLAGS += -D__ci_driver__
+EXTRA_CFLAGS += -Werror
+
+ifdef GCOV
+EXTRA_CFLAGS += -fprofile-arcs -ftest-coverage -DEFX_GCOV
+endif
+
+obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_FRONTEND) := sfc_netfront.o
+
+sfc_netfront-objs := accel_msg.o accel_bufs.o accel_netfront.o accel_vi.o accel_xenbus.o accel_tso.o accel_ssr.o accel_debugfs.o falcon_event.o falcon_vi.o pt_tx.o vi_init.o
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef NETFRONT_ACCEL_H
+#define NETFRONT_ACCEL_H
+
+#include "accel_msg_iface.h"
+#include "accel_cuckoo_hash.h"
+#include "accel_bufs.h"
+
+#include "etherfabric/ef_vi.h"
+
+#include <xen/xenbus.h>
+#include <xen/evtchn.h>
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+enum netfront_accel_post_status {
+ NETFRONT_ACCEL_STATUS_GOOD,
+ NETFRONT_ACCEL_STATUS_BUSY,
+ NETFRONT_ACCEL_STATUS_CANT
+};
+
+#define NETFRONT_ACCEL_STATS 1
+#if NETFRONT_ACCEL_STATS
+#define NETFRONT_ACCEL_STATS_OP(x) x
+#else
+#define NETFRONT_ACCEL_STATS_OP(x)
+#endif
+
+
+enum netfront_accel_msg_state {
+ NETFRONT_ACCEL_MSG_NONE = 0,
+ NETFRONT_ACCEL_MSG_HELLO = 1,
+ NETFRONT_ACCEL_MSG_HW = 2
+};
+
+
+typedef struct {
+ u32 in_progress;
+ u32 total_len;
+ struct sk_buff *skb;
+} netfront_accel_jumbo_state;
+
+
+struct netfront_accel_ssr_state {
+ /** List of tracked connections. */
+ struct list_head conns;
+
+ /** Free efx_ssr_conn instances. */
+ struct list_head free_conns;
+};
+
+
+struct netfront_accel_netdev_stats {
+ /* Fastpath stats. */
+ u32 fastpath_rx_pkts;
+ u32 fastpath_rx_bytes;
+ u32 fastpath_rx_errors;
+ u32 fastpath_tx_pkts;
+ u32 fastpath_tx_bytes;
+ u32 fastpath_tx_errors;
+};
+
+
+struct netfront_accel_netdev_dbfs {
+ struct dentry *fastpath_rx_pkts;
+ struct dentry *fastpath_rx_bytes;
+ struct dentry *fastpath_rx_errors;
+ struct dentry *fastpath_tx_pkts;
+ struct dentry *fastpath_tx_bytes;
+ struct dentry *fastpath_tx_errors;
+};
+
+
+struct netfront_accel_stats {
+ /** Fast path events */
+ u64 fastpath_tx_busy;
+
+ /** TX DMA queue status */
+ u64 fastpath_tx_completions;
+
+ /** The number of events processed. */
+ u64 event_count;
+
+ /** Number of frame trunc events seen on fastpath */
+ u64 fastpath_frm_trunc;
+
+ /** Number of no rx descriptor trunc events seen on fastpath */
+ u64 rx_no_desc_trunc;
+
+ /** The number of misc bad events (e.g. RX_DISCARD) processed. */
+ u64 bad_event_count;
+
+ /** Number of events dealt with in poll loop */
+ u32 events_per_poll_max;
+ u32 events_per_poll_tx_max;
+ u32 events_per_poll_rx_max;
+
+ /** Largest number of concurrently outstanding tx descriptors */
+ u32 fastpath_tx_pending_max;
+
+ /** The number of events since the last interrupts. */
+ u32 event_count_since_irq;
+
+ /** The max number of events between interrupts. */
+ u32 events_per_irq_max;
+
+ /** The number of interrupts. */
+ u64 irq_count;
+
+ /** The number of useless interrupts. */
+ u64 useless_irq_count;
+
+ /** The number of polls scheduled. */
+ u64 poll_schedule_count;
+
+ /** The number of polls called. */
+ u64 poll_call_count;
+
+ /** The number of rechecks. */
+ u64 poll_reschedule_count;
+
+ /** Number of times we've called netif_stop_queue/netif_wake_queue */
+ u64 queue_stops;
+ u64 queue_wakes;
+
+ /** SSR stats */
+ u64 ssr_bursts;
+ u64 ssr_drop_stream;
+ u64 ssr_misorder;
+ u64 ssr_slow_start;
+ u64 ssr_merges;
+ u64 ssr_too_many;
+ u64 ssr_new_stream;
+};
+
+
+struct netfront_accel_dbfs {
+ struct dentry *fastpath_tx_busy;
+ struct dentry *fastpath_tx_completions;
+ struct dentry *fastpath_tx_pending_max;
+ struct dentry *fastpath_frm_trunc;
+ struct dentry *rx_no_desc_trunc;
+ struct dentry *event_count;
+ struct dentry *bad_event_count;
+ struct dentry *events_per_poll_max;
+ struct dentry *events_per_poll_rx_max;
+ struct dentry *events_per_poll_tx_max;
+ struct dentry *event_count_since_irq;
+ struct dentry *events_per_irq_max;
+ struct dentry *irq_count;
+ struct dentry *useless_irq_count;
+ struct dentry *poll_schedule_count;
+ struct dentry *poll_call_count;
+ struct dentry *poll_reschedule_count;
+ struct dentry *queue_stops;
+ struct dentry *queue_wakes;
+ struct dentry *ssr_bursts;
+ struct dentry *ssr_drop_stream;
+ struct dentry *ssr_misorder;
+ struct dentry *ssr_slow_start;
+ struct dentry *ssr_merges;
+ struct dentry *ssr_too_many;
+ struct dentry *ssr_new_stream;
+};
+
+
+typedef struct netfront_accel_vnic {
+ struct netfront_accel_vnic *next;
+
+ struct mutex vnic_mutex;
+
+ spinlock_t tx_lock;
+
+ struct netfront_accel_bufpages bufpages;
+ struct netfront_accel_bufinfo *rx_bufs;
+ struct netfront_accel_bufinfo *tx_bufs;
+
+ /** Hardware & VI state */
+ ef_vi vi;
+
+ ef_vi_state *vi_state;
+
+ ef_eventq_state evq_state;
+
+ void *evq_mapping;
+
+ /** Hardware dependant state */
+ union {
+ struct {
+ /** Falcon A or B */
+ enum net_accel_hw_type type;
+ u32 *evq_rptr;
+ u32 *doorbell;
+ void *evq_rptr_mapping;
+ void *doorbell_mapping;
+ void *txdmaq_mapping;
+ void *rxdmaq_mapping;
+ } falcon;
+ } hw;
+
+ /** RX DMA queue status */
+ u32 rx_dma_level;
+
+ /** Number of RX descriptors waiting to be pushed to the card. */
+ u32 rx_dma_batched;
+#define NETFRONT_ACCEL_RX_DESC_BATCH 16
+
+ /**
+ * Hash table of remote mac addresses to decide whether to try
+ * fast path
+ */
+ cuckoo_hash_table fastpath_table;
+ spinlock_t table_lock;
+
+ /** the local mac address of virtual interface we're accelerating */
+ u8 mac[ETH_ALEN];
+
+ int rx_pkt_stride;
+ int rx_skb_stride;
+
+ /**
+ * Keep track of fragments of jumbo packets as events are
+ * delivered by NIC
+ */
+ netfront_accel_jumbo_state jumbo_state;
+
+ struct net_device *net_dev;
+
+ /** These two gate the enabling of fast path operations */
+ int frontend_ready;
+ int backend_netdev_up;
+
+ int irq_enabled;
+ spinlock_t irq_enabled_lock;
+
+ int tx_enabled;
+
+ int poll_enabled;
+
+ /** A spare slot for a TX packet. This is treated as an extension
+ * of the DMA queue. */
+ struct sk_buff *tx_skb;
+
+ /** Keep track of fragments of SSR packets */
+ struct netfront_accel_ssr_state ssr_state;
+
+ struct xenbus_device *dev;
+
+ /** Event channel for messages */
+ int msg_channel;
+ int msg_channel_irq;
+
+ /** Event channel for network interrupts. */
+ int net_channel;
+ int net_channel_irq;
+
+ struct net_accel_shared_page *shared_page;
+
+ grant_ref_t ctrl_page_gnt;
+ grant_ref_t msg_page_gnt;
+
+ /** Message Qs, 1 each way. */
+ sh_msg_fifo2 to_dom0;
+ sh_msg_fifo2 from_dom0;
+
+ enum netfront_accel_msg_state msg_state;
+
+ /** Watch on accelstate */
+ struct xenbus_watch backend_accel_watch;
+ /** Watch on frontend's MAC address */
+ struct xenbus_watch mac_address_watch;
+
+ /** Work to process received irq/msg */
+ struct work_struct msg_from_bend;
+
+ /** Wait queue for changes in accelstate. */
+ wait_queue_head_t state_wait_queue;
+
+ /** The current accelstate of this driver. */
+ XenbusState frontend_state;
+
+ /** The most recent accelstate seen by the xenbus watch. */
+ XenbusState backend_state;
+
+ /** Non-zero if we should reject requests to connect. */
+ int removing;
+
+ /** Non-zero if the domU shared state has been initialised. */
+ int domU_state_is_setup;
+
+ /** Non-zero if the dom0 shared state has been initialised. */
+ int dom0_state_is_setup;
+
+ /* Those statistics that are added to the netdev stats */
+ struct netfront_accel_netdev_stats netdev_stats;
+ struct netfront_accel_netdev_stats stats_last_read;
+#ifdef CONFIG_DEBUG_FS
+ struct netfront_accel_netdev_dbfs netdev_dbfs;
+#endif
+
+ /* These statistics are internal and optional */
+#if NETFRONT_ACCEL_STATS
+ struct netfront_accel_stats stats;
+#ifdef CONFIG_DEBUG_FS
+ struct netfront_accel_dbfs dbfs;
+#endif
+#endif
+
+ /** Debufs fs dir for this interface */
+ struct dentry *dbfs_dir;
+} netfront_accel_vnic;
+
+
+/* Module parameters */
+extern unsigned sfc_netfront_max_pages;
+extern unsigned sfc_netfront_buffer_split;
+
+extern const char *frontend_name;
+extern struct netfront_accel_hooks accel_hooks;
+extern struct workqueue_struct *netfront_accel_workqueue;
+
+
+extern
+void netfront_accel_vi_ctor(netfront_accel_vnic *vnic);
+
+extern
+int netfront_accel_vi_init(netfront_accel_vnic *vnic,
+ struct net_accel_msg_hw *hw_msg);
+
+extern
+void netfront_accel_vi_dtor(netfront_accel_vnic *vnic);
+
+
+/**
+ * Add new buffers which have been registered with the NIC.
+ *
+ * @v vnic The vnic instance to process the response.
+ *
+ * The buffers contained in the message are added to the buffer pool.
+ */
+extern
+void netfront_accel_vi_add_bufs(netfront_accel_vnic *vnic, int is_rx);
+
+/**
+ * Put a packet on the tx DMA queue.
+ *
+ * @v vnic The vnic instance to accept the packet.
+ * @v skb A sk_buff to send.
+ *
+ * Attempt to send a packet. On success, the skb is owned by the DMA
+ * queue and will be released when the completion event arrives.
+ */
+extern enum netfront_accel_post_status
+netfront_accel_vi_tx_post(netfront_accel_vnic *vnic,
+ struct sk_buff *skb);
+
+
+/**
+ * Process events in response to an interrupt.
+ *
+ * @v vnic The vnic instance to poll.
+ * @v rx_packets The maximum number of rx packets to process.
+ * @ret rx_done The number of rx packets processed.
+ *
+ * The vnic will process events until there are no more events
+ * remaining or the specified number of rx packets has been processed.
+ * The split from the interrupt call is to allow Linux NAPI
+ * polling.
+ */
+extern
+int netfront_accel_vi_poll(netfront_accel_vnic *vnic, int rx_packets);
+
+
+/**
+ * Iterate over the fragments of a packet buffer.
+ *
+ * @v skb The packet buffer to examine.
+ * @v idx A variable name for the fragment index.
+ * @v data A variable name for the address of the fragment data.
+ * @v length A variable name for the fragment length.
+ * @v code A section of code to execute for each fragment.
+ *
+ * This macro iterates over the fragments in a packet buffer and
+ * executes the code for each of them.
+ */
+#define NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT(skb, frag_idx, \
+ frag_data, frag_len, \
+ code) \
+ do { \
+ int frag_idx; \
+ void *frag_data; \
+ unsigned int frag_len; \
+ \
+ frag_data = skb->data; \
+ frag_len = skb_headlen(skb); \
+ frag_idx = 0; \
+ while (1) { /* For each fragment */ \
+ code; \
+ if (frag_idx >= skb_shinfo(skb)->nr_frags) { \
+ break; \
+ } else { \
+ skb_frag_t *fragment; \
+ fragment = &skb_shinfo(skb)->frags[frag_idx]; \
+ frag_len = fragment->size; \
+ frag_data = ((void*)page_address(fragment->page) \
+ + fragment->page_offset); \
+ }; \
+ frag_idx++; \
+ } \
+ } while(0)
+
+static inline
+void netfront_accel_disable_net_interrupts(netfront_accel_vnic *vnic)
+{
+ mask_evtchn(vnic->net_channel);
+}
+
+static inline
+void netfront_accel_enable_net_interrupts(netfront_accel_vnic *vnic)
+{
+ unmask_evtchn(vnic->net_channel);
+}
+
+void netfront_accel_msg_tx_fastpath(netfront_accel_vnic *vnic, const void *mac,
+ u32 ip, u16 port, u8 protocol);
+
+/* Process an IRQ received from back end driver */
+irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context);
+irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+extern void netfront_accel_msg_from_bend(struct work_struct *context);
+#else
+extern void netfront_accel_msg_from_bend(void *context);
+#endif
+
+extern void vnic_stop_fastpath(netfront_accel_vnic *vnic);
+
+extern int netfront_accel_probe(struct net_device *net_dev,
+ struct xenbus_device *dev);
+extern int netfront_accel_remove(struct xenbus_device *dev);
+extern void netfront_accel_set_closing(netfront_accel_vnic *vnic);
+
+extern int netfront_accel_vi_enable_interrupts(netfront_accel_vnic *vnic);
+
+extern void netfront_accel_debugfs_init(void);
+extern void netfront_accel_debugfs_fini(void);
+extern int netfront_accel_debugfs_create(netfront_accel_vnic *vnic);
+extern int netfront_accel_debugfs_remove(netfront_accel_vnic *vnic);
+
+#endif /* NETFRONT_ACCEL_H */
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <xen/gnttab.h>
+
+#include "accel_bufs.h"
+#include "accel_util.h"
+
+#include "accel.h"
+
+
+static int
+netfront_accel_alloc_buf_desc_blocks(struct netfront_accel_bufinfo *manager,
+ int pages)
+{
+ manager->desc_blocks =
+ kzalloc(sizeof(struct netfront_accel_pkt_desc *) *
+ NETFRONT_ACCEL_BUF_NUM_BLOCKS(pages), GFP_KERNEL);
+ if (manager->desc_blocks == NULL) {
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int
+netfront_accel_alloc_buf_lists(struct netfront_accel_bufpages *bufpages,
+ int pages)
+{
+ bufpages->page_list = kmalloc(pages * sizeof(void *), GFP_KERNEL);
+ if (bufpages->page_list == NULL) {
+ return -ENOMEM;
+ }
+
+ bufpages->grant_list = kzalloc(pages * sizeof(grant_ref_t), GFP_KERNEL);
+ if (bufpages->grant_list == NULL) {
+ kfree(bufpages->page_list);
+ bufpages->page_list = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+
+int netfront_accel_alloc_buffer_mem(struct netfront_accel_bufpages *bufpages,
+ struct netfront_accel_bufinfo *rx_manager,
+ struct netfront_accel_bufinfo *tx_manager,
+ int pages)
+{
+ int n, rc;
+
+ if ((rc = netfront_accel_alloc_buf_desc_blocks
+ (rx_manager, pages - (pages / sfc_netfront_buffer_split))) < 0) {
+ goto rx_fail;
+ }
+
+ if ((rc = netfront_accel_alloc_buf_desc_blocks
+ (tx_manager, pages / sfc_netfront_buffer_split)) < 0) {
+ goto tx_fail;
+ }
+
+ if ((rc = netfront_accel_alloc_buf_lists(bufpages, pages)) < 0) {
+ goto lists_fail;
+ }
+
+ for (n = 0; n < pages; n++) {
+ void *tmp = (void*)__get_free_page(GFP_KERNEL);
+ if (tmp == NULL)
+ break;
+
+ bufpages->page_list[n] = tmp;
+ }
+
+ if (n != pages) {
+ EPRINTK("%s: not enough pages: %d != %d\n", __FUNCTION__, n,
+ pages);
+ for (; n >= 0; n--)
+ free_page((unsigned long)(bufpages->page_list[n]));
+ rc = -ENOMEM;
+ goto pages_fail;
+ }
+
+ bufpages->max_pages = pages;
+ bufpages->page_reqs = 0;
+
+ return 0;
+
+ pages_fail:
+ kfree(bufpages->page_list);
+ kfree(bufpages->grant_list);
+
+ bufpages->page_list = NULL;
+ bufpages->grant_list = NULL;
+ lists_fail:
+ kfree(tx_manager->desc_blocks);
+ tx_manager->desc_blocks = NULL;
+
+ tx_fail:
+ kfree(rx_manager->desc_blocks);
+ rx_manager->desc_blocks = NULL;
+ rx_fail:
+ return rc;
+}
+
+
+void netfront_accel_free_buffer_mem(struct netfront_accel_bufpages *bufpages,
+ struct netfront_accel_bufinfo *rx_manager,
+ struct netfront_accel_bufinfo *tx_manager)
+{
+ int i;
+
+ for (i = 0; i < bufpages->max_pages; i++) {
+ if (bufpages->grant_list[i] != 0)
+ net_accel_ungrant_page(bufpages->grant_list[i]);
+ free_page((unsigned long)(bufpages->page_list[i]));
+ }
+
+ if (bufpages->max_pages) {
+ kfree(bufpages->page_list);
+ kfree(bufpages->grant_list);
+ kfree(rx_manager->desc_blocks);
+ kfree(tx_manager->desc_blocks);
+ }
+}
+
+
+/*
+ * Allocate memory for the buffer manager and create a lock. If no
+ * lock is supplied its own is allocated.
+ */
+struct netfront_accel_bufinfo *netfront_accel_init_bufs(spinlock_t *lock)
+{
+ struct netfront_accel_bufinfo *res = kmalloc(sizeof(*res), GFP_KERNEL);
+ if (res != NULL) {
+ res->npages = res->nused = 0;
+ res->first_free = -1;
+
+ if (lock == NULL) {
+ res->lock = kmalloc(sizeof(*res->lock), GFP_KERNEL);
+ if (res->lock == NULL) {
+ kfree(res);
+ return NULL;
+ }
+ spin_lock_init(res->lock);
+ res->internally_locked = 1;
+ } else {
+ res->lock = lock;
+ res->internally_locked = 0;
+ }
+
+ res->desc_blocks = NULL;
+ }
+
+ return res;
+}
+
+
+void netfront_accel_fini_bufs(struct netfront_accel_bufinfo *bufs)
+{
+ if (bufs->internally_locked)
+ kfree(bufs->lock);
+ kfree(bufs);
+}
+
+
+int netfront_accel_buf_map_request(struct xenbus_device *dev,
+ struct netfront_accel_bufpages *bufpages,
+ struct net_accel_msg *msg,
+ int pages, int offset)
+{
+ int i, mfn;
+ int err;
+
+ net_accel_msg_init(msg, NET_ACCEL_MSG_MAPBUF);
+
+ BUG_ON(pages > NET_ACCEL_MSG_MAX_PAGE_REQ);
+
+ msg->u.mapbufs.pages = pages;
+
+ for (i = 0; i < msg->u.mapbufs.pages; i++) {
+ /*
+ * This can happen if we tried to send this message
+ * earlier but the queue was full.
+ */
+ if (bufpages->grant_list[offset+i] != 0) {
+ msg->u.mapbufs.grants[i] =
+ bufpages->grant_list[offset+i];
+ continue;
+ }
+
+ mfn = virt_to_mfn(bufpages->page_list[offset+i]);
+ VPRINTK("%s: Granting page %d, mfn %08x\n",
+ __FUNCTION__, i, mfn);
+
+ bufpages->grant_list[offset+i] =
+ net_accel_grant_page(dev, mfn, 0);
+ msg->u.mapbufs.grants[i] = bufpages->grant_list[offset+i];
+
+ if (msg->u.mapbufs.grants[i] < 0) {
+ EPRINTK("%s: Failed to grant buffer: %d\n",
+ __FUNCTION__, msg->u.mapbufs.grants[i]);
+ err = -EIO;
+ goto error;
+ }
+ }
+
+ /* This is interpreted on return as the offset in the the page_list */
+ msg->u.mapbufs.reqid = offset;
+
+ return 0;
+
+error:
+ /* Ungrant all the pages we've successfully granted. */
+ for (i--; i >= 0; i--) {
+ net_accel_ungrant_page(bufpages->grant_list[offset+i]);
+ bufpages->grant_list[offset+i] = 0;
+ }
+ return err;
+}
+
+
+/* Process a response to a buffer request. */
+int netfront_accel_add_bufs(struct netfront_accel_bufpages *bufpages,
+ struct netfront_accel_bufinfo *manager,
+ struct net_accel_msg *msg)
+{
+ int msg_pages, page_offset, i, newtot;
+ int old_block_count, new_block_count;
+ u32 msg_buf;
+ unsigned long flags;
+
+ VPRINTK("%s: manager %p msg %p\n", __FUNCTION__, manager, msg);
+
+ BUG_ON(msg->id != (NET_ACCEL_MSG_MAPBUF | NET_ACCEL_MSG_REPLY));
+
+ msg_pages = msg->u.mapbufs.pages;
+ msg_buf = msg->u.mapbufs.buf;
+ page_offset = msg->u.mapbufs.reqid;
+
+ spin_lock_irqsave(manager->lock, flags);
+ newtot = manager->npages + msg_pages;
+ old_block_count =
+ (manager->npages + NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK - 1) >>
+ NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT;
+ new_block_count =
+ (newtot + NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK - 1) >>
+ NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT;
+
+ for (i = old_block_count; i < new_block_count; i++) {
+ struct netfront_accel_pkt_desc *block;
+ if (manager->desc_blocks[i] != NULL) {
+ VPRINTK("Not needed\n");
+ continue;
+ }
+ block = kzalloc(NETFRONT_ACCEL_BUFS_PER_BLOCK *
+ sizeof(netfront_accel_pkt_desc), GFP_ATOMIC);
+ if (block == NULL) {
+ spin_unlock_irqrestore(manager->lock, flags);
+ return -ENOMEM;
+ }
+ manager->desc_blocks[i] = block;
+ }
+ for (i = manager->npages; i < newtot; i++) {
+ int k, j = i - manager->npages;
+ int block_num;
+ int block_idx;
+ struct netfront_accel_pkt_desc *pkt;
+
+ block_num = i >> NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT;
+ block_idx = (NETFRONT_ACCEL_BUFS_PER_PAGE*i)
+ & (NETFRONT_ACCEL_BUFS_PER_BLOCK-1);
+
+ pkt = manager->desc_blocks[block_num] + block_idx;
+
+ for (k = 0; k < NETFRONT_ACCEL_BUFS_PER_PAGE; k++) {
+ BUG_ON(page_offset + j >= bufpages->max_pages);
+
+ pkt[k].buf_id = NETFRONT_ACCEL_BUFS_PER_PAGE * i + k;
+ pkt[k].pkt_kva = bufpages->page_list[page_offset + j] +
+ (PAGE_SIZE/NETFRONT_ACCEL_BUFS_PER_PAGE) * k;
+ pkt[k].pkt_buff_addr = msg_buf +
+ (PAGE_SIZE/NETFRONT_ACCEL_BUFS_PER_PAGE) *
+ (NETFRONT_ACCEL_BUFS_PER_PAGE * j + k);
+ pkt[k].next_free = manager->first_free;
+ manager->first_free = pkt[k].buf_id;
+ *(int*)(pkt[k].pkt_kva) = pkt[k].buf_id;
+
+ VPRINTK("buf %d desc %p kva %p buffaddr %x\n",
+ pkt[k].buf_id, &(pkt[k]), pkt[k].pkt_kva,
+ pkt[k].pkt_buff_addr);
+ }
+ }
+ manager->npages = newtot;
+ spin_unlock_irqrestore(manager->lock, flags);
+ VPRINTK("Added %d pages. Total is now %d\n", msg_pages,
+ manager->npages);
+ return 0;
+}
+
+
+netfront_accel_pkt_desc *
+netfront_accel_buf_find(struct netfront_accel_bufinfo *manager, u16 id)
+{
+ netfront_accel_pkt_desc *pkt;
+ int block_num = id >> NETFRONT_ACCEL_BUFS_PER_BLOCK_SHIFT;
+ int block_idx = id & (NETFRONT_ACCEL_BUFS_PER_BLOCK - 1);
+ BUG_ON(id >= manager->npages * NETFRONT_ACCEL_BUFS_PER_PAGE);
+ BUG_ON(block_idx >= NETFRONT_ACCEL_BUFS_PER_BLOCK);
+ pkt = manager->desc_blocks[block_num] + block_idx;
+ return pkt;
+}
+
+
+/* Allocate a buffer from the buffer manager */
+netfront_accel_pkt_desc *
+netfront_accel_buf_get(struct netfront_accel_bufinfo *manager)
+{
+ int bufno = -1;
+ netfront_accel_pkt_desc *buf = NULL;
+ unsigned long flags = 0;
+
+ /* Any spare? */
+ if (manager->first_free == -1)
+ return NULL;
+ /* Take lock */
+ if (manager->internally_locked)
+ spin_lock_irqsave(manager->lock, flags);
+ bufno = manager->first_free;
+ if (bufno != -1) {
+ buf = netfront_accel_buf_find(manager, bufno);
+ manager->first_free = buf->next_free;
+ manager->nused++;
+ }
+ /* Release lock */
+ if (manager->internally_locked)
+ spin_unlock_irqrestore(manager->lock, flags);
+
+ /* Tell the world */
+ VPRINTK("Allocated buffer %i, buffaddr %x\n", bufno,
+ buf->pkt_buff_addr);
+
+ return buf;
+}
+
+
+/* Release a buffer back to the buffer manager pool */
+int netfront_accel_buf_put(struct netfront_accel_bufinfo *manager, u16 id)
+{
+ netfront_accel_pkt_desc *buf = netfront_accel_buf_find(manager, id);
+ unsigned long flags = 0;
+ unsigned was_empty = 0;
+ int bufno = id;
+
+ VPRINTK("Freeing buffer %i\n", id);
+ BUG_ON(id == (u16)-1);
+
+ if (manager->internally_locked)
+ spin_lock_irqsave(manager->lock, flags);
+
+ if (manager->first_free == -1)
+ was_empty = 1;
+
+ buf->next_free = manager->first_free;
+ manager->first_free = bufno;
+ manager->nused--;
+
+ if (manager->internally_locked)
+ spin_unlock_irqrestore(manager->lock, flags);
+
+ return was_empty;
+}
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef NETFRONT_ACCEL_BUFS_H
+#define NETFRONT_ACCEL_BUFS_H
+
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <xen/xenbus.h>
+
+#include "accel_msg_iface.h"
+
+
+/*! Buffer descriptor structure */
+typedef struct netfront_accel_pkt_desc {
+ int buf_id;
+ u32 pkt_buff_addr;
+ void *pkt_kva;
+ /* This is the socket buffer currently married to this buffer */
+ struct sk_buff *skb;
+ int next_free;
+} netfront_accel_pkt_desc;
+
+
+#define NETFRONT_ACCEL_DEFAULT_BUF_PAGES (384)
+#define NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT (4)
+#define NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK \
+ (1 << (NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT))
+#define NETFRONT_ACCEL_BUFS_PER_PAGE_SHIFT (1)
+#define NETFRONT_ACCEL_BUFS_PER_PAGE \
+ (1 << (NETFRONT_ACCEL_BUFS_PER_PAGE_SHIFT))
+#define NETFRONT_ACCEL_BUFS_PER_BLOCK_SHIFT \
+ (NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT + \
+ NETFRONT_ACCEL_BUFS_PER_PAGE_SHIFT)
+#define NETFRONT_ACCEL_BUFS_PER_BLOCK \
+ (1 << NETFRONT_ACCEL_BUFS_PER_BLOCK_SHIFT)
+#define NETFRONT_ACCEL_BUF_NUM_BLOCKS(max_pages) \
+ (((max_pages)+NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK-1) / \
+ NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK)
+
+/*! Buffer management structure. */
+struct netfront_accel_bufinfo {
+ /* number added to this manager */
+ unsigned npages;
+ /* number currently used from this manager */
+ unsigned nused;
+
+ int first_free;
+
+ int internally_locked;
+ spinlock_t *lock;
+
+ /*
+ * array of pointers (length NETFRONT_ACCEL_BUF_NUM_BLOCKS) to
+ * pkt descs
+ */
+ struct netfront_accel_pkt_desc **desc_blocks;
+};
+
+
+struct netfront_accel_bufpages {
+ /* length of lists of pages/grants */
+ int max_pages;
+ /* list of pages allocated for network buffers */
+ void **page_list;
+ /* list of grants for the above pages */
+ grant_ref_t *grant_list;
+
+ /* number of page requests that have been made */
+ unsigned page_reqs;
+};
+
+
+/*! Allocate memory for the buffer manager, set up locks etc.
+ * Optionally takes a lock to use, if not supplied it makes its own.
+ *
+ * \return pointer to netfront_accel_bufinfo structure that represents the
+ * buffer manager
+ */
+extern struct netfront_accel_bufinfo *
+netfront_accel_init_bufs(spinlock_t *lock);
+
+/*! Allocate memory for the buffers
+ */
+extern int
+netfront_accel_alloc_buffer_mem(struct netfront_accel_bufpages *bufpages,
+ struct netfront_accel_bufinfo *rx_res,
+ struct netfront_accel_bufinfo *tx_res,
+ int pages);
+extern void
+netfront_accel_free_buffer_mem(struct netfront_accel_bufpages *bufpages,
+ struct netfront_accel_bufinfo *rx_res,
+ struct netfront_accel_bufinfo *tx_res);
+
+/*! Release memory for the buffer manager, buffers, etc.
+ *
+ * \param manager pointer to netfront_accel_bufinfo structure that
+ * represents the buffer manager
+ */
+extern void netfront_accel_fini_bufs(struct netfront_accel_bufinfo *manager);
+
+/*! Release a buffer.
+ *
+ * \param manager The buffer manager which owns the buffer.
+ * \param id The buffer identifier.
+ */
+extern int netfront_accel_buf_put(struct netfront_accel_bufinfo *manager,
+ u16 id);
+
+/*! Get the packet descriptor associated with a buffer id.
+ *
+ * \param manager The buffer manager which owns the buffer.
+ * \param id The buffer identifier.
+ *
+ * The returned value is the packet descriptor for this buffer.
+ */
+extern netfront_accel_pkt_desc *
+netfront_accel_buf_find(struct netfront_accel_bufinfo *manager, u16 id);
+
+
+/*! Fill out a message request for some buffers to be mapped by the
+ * back end driver
+ *
+ * \param manager The buffer manager
+ * \param msg Pointer to an ef_msg to complete.
+ * \return 0 on success
+ */
+extern int
+netfront_accel_buf_map_request(struct xenbus_device *dev,
+ struct netfront_accel_bufpages *bufpages,
+ struct net_accel_msg *msg,
+ int pages, int offset);
+
+/*! Process a response to a buffer request.
+ *
+ * Deal with a received message from the back end in response to our
+ * request for buffers
+ *
+ * \param manager The buffer manager
+ * \param msg The received message from the back end describing new
+ * buffers
+ * \return 0 on success
+ */
+extern int
+netfront_accel_add_bufs(struct netfront_accel_bufpages *bufpages,
+ struct netfront_accel_bufinfo *manager,
+ struct net_accel_msg *msg);
+
+
+/*! Allocate a buffer from the buffer manager
+ *
+ * \param manager The buffer manager data structure
+ * \param id On exit, the id of the buffer allocated
+ * \return Pointer to buffer descriptor.
+ */
+struct netfront_accel_pkt_desc *
+netfront_accel_buf_get(struct netfront_accel_bufinfo *manager);
+
+#endif /* NETFRONT_ACCEL_BUFS_H */
+
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+
+#include "accel.h"
+
+#if defined(CONFIG_DEBUG_FS)
+static struct dentry *sfc_debugfs_root = NULL;
+#endif
+
+void netfront_accel_debugfs_init(void)
+{
+#if defined(CONFIG_DEBUG_FS)
+ sfc_debugfs_root = debugfs_create_dir(frontend_name, NULL);
+#endif
+}
+
+
+void netfront_accel_debugfs_fini(void)
+{
+#if defined(CONFIG_DEBUG_FS)
+ if (sfc_debugfs_root)
+ debugfs_remove(sfc_debugfs_root);
+#endif
+}
+
+
+int netfront_accel_debugfs_create(netfront_accel_vnic *vnic)
+{
+#if defined(CONFIG_DEBUG_FS)
+ if (sfc_debugfs_root == NULL)
+ return -ENOENT;
+
+ vnic->dbfs_dir = debugfs_create_dir(vnic->net_dev->name,
+ sfc_debugfs_root);
+ if (vnic->dbfs_dir == NULL)
+ return -ENOMEM;
+
+ vnic->netdev_dbfs.fastpath_rx_pkts = debugfs_create_u32
+ ("fastpath_rx_pkts", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->netdev_stats.fastpath_rx_pkts);
+ vnic->netdev_dbfs.fastpath_rx_bytes = debugfs_create_u32
+ ("fastpath_rx_bytes", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->netdev_stats.fastpath_rx_bytes);
+ vnic->netdev_dbfs.fastpath_rx_errors = debugfs_create_u32
+ ("fastpath_rx_errors", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->netdev_stats.fastpath_rx_errors);
+ vnic->netdev_dbfs.fastpath_tx_pkts = debugfs_create_u32
+ ("fastpath_tx_pkts", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->netdev_stats.fastpath_tx_pkts);
+ vnic->netdev_dbfs.fastpath_tx_bytes = debugfs_create_u32
+ ("fastpath_tx_bytes", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->netdev_stats.fastpath_tx_bytes);
+ vnic->netdev_dbfs.fastpath_tx_errors = debugfs_create_u32
+ ("fastpath_tx_errors", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->netdev_stats.fastpath_tx_errors);
+
+#if NETFRONT_ACCEL_STATS
+ vnic->dbfs.irq_count = debugfs_create_u64
+ ("irq_count", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.irq_count);
+ vnic->dbfs.useless_irq_count = debugfs_create_u64
+ ("useless_irq_count", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.useless_irq_count);
+ vnic->dbfs.poll_schedule_count = debugfs_create_u64
+ ("poll_schedule_count", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.poll_schedule_count);
+ vnic->dbfs.poll_call_count = debugfs_create_u64
+ ("poll_call_count", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.poll_call_count);
+ vnic->dbfs.poll_reschedule_count = debugfs_create_u64
+ ("poll_reschedule_count", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.poll_reschedule_count);
+ vnic->dbfs.queue_stops = debugfs_create_u64
+ ("queue_stops", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.queue_stops);
+ vnic->dbfs.queue_wakes = debugfs_create_u64
+ ("queue_wakes", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.queue_wakes);
+ vnic->dbfs.ssr_bursts = debugfs_create_u64
+ ("ssr_bursts", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.ssr_bursts);
+ vnic->dbfs.ssr_drop_stream = debugfs_create_u64
+ ("ssr_drop_stream", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.ssr_drop_stream);
+ vnic->dbfs.ssr_misorder = debugfs_create_u64
+ ("ssr_misorder", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.ssr_misorder);
+ vnic->dbfs.ssr_slow_start = debugfs_create_u64
+ ("ssr_slow_start", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.ssr_slow_start);
+ vnic->dbfs.ssr_merges = debugfs_create_u64
+ ("ssr_merges", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.ssr_merges);
+ vnic->dbfs.ssr_too_many = debugfs_create_u64
+ ("ssr_too_many", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.ssr_too_many);
+ vnic->dbfs.ssr_new_stream = debugfs_create_u64
+ ("ssr_new_stream", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.ssr_new_stream);
+
+ vnic->dbfs.fastpath_tx_busy = debugfs_create_u64
+ ("fastpath_tx_busy", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.fastpath_tx_busy);
+ vnic->dbfs.fastpath_tx_completions = debugfs_create_u64
+ ("fastpath_tx_completions", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.fastpath_tx_completions);
+ vnic->dbfs.fastpath_tx_pending_max = debugfs_create_u32
+ ("fastpath_tx_pending_max", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.fastpath_tx_pending_max);
+ vnic->dbfs.event_count = debugfs_create_u64
+ ("event_count", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.event_count);
+ vnic->dbfs.bad_event_count = debugfs_create_u64
+ ("bad_event_count", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.bad_event_count);
+ vnic->dbfs.event_count_since_irq = debugfs_create_u32
+ ("event_count_since_irq", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.event_count_since_irq);
+ vnic->dbfs.events_per_irq_max = debugfs_create_u32
+ ("events_per_irq_max", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.events_per_irq_max);
+ vnic->dbfs.fastpath_frm_trunc = debugfs_create_u64
+ ("fastpath_frm_trunc", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.fastpath_frm_trunc);
+ vnic->dbfs.rx_no_desc_trunc = debugfs_create_u64
+ ("rx_no_desc_trunc", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.rx_no_desc_trunc);
+ vnic->dbfs.events_per_poll_max = debugfs_create_u32
+ ("events_per_poll_max", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.events_per_poll_max);
+ vnic->dbfs.events_per_poll_rx_max = debugfs_create_u32
+ ("events_per_poll_rx_max", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.events_per_poll_rx_max);
+ vnic->dbfs.events_per_poll_tx_max = debugfs_create_u32
+ ("events_per_poll_tx_max", S_IRUSR | S_IRGRP | S_IROTH,
+ vnic->dbfs_dir, &vnic->stats.events_per_poll_tx_max);
+#endif
+#endif
+ return 0;
+}
+
+
+int netfront_accel_debugfs_remove(netfront_accel_vnic *vnic)
+{
+#if defined(CONFIG_DEBUG_FS)
+ if (vnic->dbfs_dir != NULL) {
+ debugfs_remove(vnic->netdev_dbfs.fastpath_rx_pkts);
+ debugfs_remove(vnic->netdev_dbfs.fastpath_rx_bytes);
+ debugfs_remove(vnic->netdev_dbfs.fastpath_rx_errors);
+ debugfs_remove(vnic->netdev_dbfs.fastpath_tx_pkts);
+ debugfs_remove(vnic->netdev_dbfs.fastpath_tx_bytes);
+ debugfs_remove(vnic->netdev_dbfs.fastpath_tx_errors);
+
+#if NETFRONT_ACCEL_STATS
+ debugfs_remove(vnic->dbfs.irq_count);
+ debugfs_remove(vnic->dbfs.useless_irq_count);
+ debugfs_remove(vnic->dbfs.poll_schedule_count);
+ debugfs_remove(vnic->dbfs.poll_call_count);
+ debugfs_remove(vnic->dbfs.poll_reschedule_count);
+ debugfs_remove(vnic->dbfs.queue_stops);
+ debugfs_remove(vnic->dbfs.queue_wakes);
+ debugfs_remove(vnic->dbfs.ssr_bursts);
+ debugfs_remove(vnic->dbfs.ssr_drop_stream);
+ debugfs_remove(vnic->dbfs.ssr_misorder);
+ debugfs_remove(vnic->dbfs.ssr_slow_start);
+ debugfs_remove(vnic->dbfs.ssr_merges);
+ debugfs_remove(vnic->dbfs.ssr_too_many);
+ debugfs_remove(vnic->dbfs.ssr_new_stream);
+
+ debugfs_remove(vnic->dbfs.fastpath_tx_busy);
+ debugfs_remove(vnic->dbfs.fastpath_tx_completions);
+ debugfs_remove(vnic->dbfs.fastpath_tx_pending_max);
+ debugfs_remove(vnic->dbfs.event_count);
+ debugfs_remove(vnic->dbfs.bad_event_count);
+ debugfs_remove(vnic->dbfs.event_count_since_irq);
+ debugfs_remove(vnic->dbfs.events_per_irq_max);
+ debugfs_remove(vnic->dbfs.fastpath_frm_trunc);
+ debugfs_remove(vnic->dbfs.rx_no_desc_trunc);
+ debugfs_remove(vnic->dbfs.events_per_poll_max);
+ debugfs_remove(vnic->dbfs.events_per_poll_rx_max);
+ debugfs_remove(vnic->dbfs.events_per_poll_tx_max);
+#endif
+ debugfs_remove(vnic->dbfs_dir);
+ }
+#endif
+ return 0;
+}
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <linux/stddef.h>
+#include <linux/errno.h>
+
+#include <xen/xenbus.h>
+
+#include "accel.h"
+#include "accel_msg_iface.h"
+#include "accel_util.h"
+#include "accel_bufs.h"
+
+#include "netfront.h" /* drivers/xen/netfront/netfront.h */
+
+static void vnic_start_interrupts(netfront_accel_vnic *vnic)
+{
+ unsigned long flags;
+
+ /* Prime our interrupt */
+ spin_lock_irqsave(&vnic->irq_enabled_lock, flags);
+ if (!netfront_accel_vi_enable_interrupts(vnic)) {
+ struct netfront_info *np = netdev_priv(vnic->net_dev);
+
+ /* Cripes, that was quick, better pass it up */
+ netfront_accel_disable_net_interrupts(vnic);
+ vnic->irq_enabled = 0;
+ NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_schedule_count++);
+ netif_rx_schedule(vnic->net_dev, &np->napi);
+ } else {
+ /*
+ * Nothing yet, make sure we get interrupts through
+ * back end
+ */
+ vnic->irq_enabled = 1;
+ netfront_accel_enable_net_interrupts(vnic);
+ }
+ spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
+}
+
+
+static void vnic_stop_interrupts(netfront_accel_vnic *vnic)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&vnic->irq_enabled_lock, flags);
+ netfront_accel_disable_net_interrupts(vnic);
+ vnic->irq_enabled = 0;
+ spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
+}
+
+
+static void vnic_start_fastpath(netfront_accel_vnic *vnic)
+{
+ struct net_device *net_dev = vnic->net_dev;
+ struct netfront_info *np = netdev_priv(net_dev);
+ unsigned long flags;
+
+ DPRINTK("%s\n", __FUNCTION__);
+
+ spin_lock_irqsave(&vnic->tx_lock, flags);
+ vnic->tx_enabled = 1;
+ spin_unlock_irqrestore(&vnic->tx_lock, flags);
+
+ napi_disable(&np->napi);
+ vnic->poll_enabled = 1;
+ napi_enable(&np->napi);
+
+ vnic_start_interrupts(vnic);
+}
+
+
+void vnic_stop_fastpath(netfront_accel_vnic *vnic)
+{
+ struct net_device *net_dev = vnic->net_dev;
+ struct netfront_info *np = (struct netfront_info *)netdev_priv(net_dev);
+ unsigned long flags1, flags2;
+
+ DPRINTK("%s\n", __FUNCTION__);
+
+ vnic_stop_interrupts(vnic);
+
+ spin_lock_irqsave(&vnic->tx_lock, flags1);
+ vnic->tx_enabled = 0;
+ spin_lock_irqsave(&np->tx_lock, flags2);
+ if (vnic->tx_skb != NULL) {
+ dev_kfree_skb_any(vnic->tx_skb);
+ vnic->tx_skb = NULL;
+ if (netfront_check_queue_ready(net_dev)) {
+ netif_wake_queue(net_dev);
+ NETFRONT_ACCEL_STATS_OP
+ (vnic->stats.queue_wakes++);
+ }
+ }
+ spin_unlock_irqrestore(&np->tx_lock, flags2);
+ spin_unlock_irqrestore(&vnic->tx_lock, flags1);
+
+ /* Must prevent polls and hold lock to modify poll_enabled */
+ napi_disable(&np->napi);
+ spin_lock_irqsave(&vnic->irq_enabled_lock, flags1);
+ vnic->poll_enabled = 0;
+ spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags1);
+ napi_enable(&np->napi);
+}
+
+
+static void netfront_accel_interface_up(netfront_accel_vnic *vnic)
+{
+
+ if (!vnic->backend_netdev_up) {
+ vnic->backend_netdev_up = 1;
+
+ if (vnic->frontend_ready)
+ vnic_start_fastpath(vnic);
+ }
+}
+
+
+static void netfront_accel_interface_down(netfront_accel_vnic *vnic)
+{
+
+ if (vnic->backend_netdev_up) {
+ vnic->backend_netdev_up = 0;
+
+ if (vnic->frontend_ready)
+ vnic_stop_fastpath(vnic);
+ }
+}
+
+
+static int vnic_add_bufs(netfront_accel_vnic *vnic,
+ struct net_accel_msg *msg)
+{
+ int rc, offset;
+ struct netfront_accel_bufinfo *bufinfo;
+
+ BUG_ON(msg->u.mapbufs.pages > NET_ACCEL_MSG_MAX_PAGE_REQ);
+
+ offset = msg->u.mapbufs.reqid;
+
+ if (offset < vnic->bufpages.max_pages -
+ (vnic->bufpages.max_pages / sfc_netfront_buffer_split)) {
+ bufinfo = vnic->rx_bufs;
+ } else
+ bufinfo = vnic->tx_bufs;
+
+ /* Queue up some Rx buffers to start things off. */
+ if ((rc = netfront_accel_add_bufs(&vnic->bufpages, bufinfo, msg)) == 0) {
+ netfront_accel_vi_add_bufs(vnic, bufinfo == vnic->rx_bufs);
+
+ if (offset + msg->u.mapbufs.pages == vnic->bufpages.max_pages) {
+ VPRINTK("%s: got all buffers back\n", __FUNCTION__);
+ vnic->frontend_ready = 1;
+ if (vnic->backend_netdev_up)
+ vnic_start_fastpath(vnic);
+ } else {
+ VPRINTK("%s: got buffers back %d %d\n", __FUNCTION__,
+ offset, msg->u.mapbufs.pages);
+ }
+ }
+
+ return rc;
+}
+
+
+/* The largest [o] such that (1u << o) <= n. Requires n > 0. */
+
+inline unsigned log2_le(unsigned long n) {
+ unsigned order = 1;
+ while ((1ul << order) <= n) ++order;
+ return (order - 1);
+}
+
+static int vnic_send_buffer_requests(netfront_accel_vnic *vnic,
+ struct netfront_accel_bufpages *bufpages)
+{
+ int pages, offset, rc = 0, sent = 0;
+ struct net_accel_msg msg;
+
+ while (bufpages->page_reqs < bufpages->max_pages) {
+ offset = bufpages->page_reqs;
+
+ pages = pow2(log2_le(bufpages->max_pages -
+ bufpages->page_reqs));
+ pages = pages < NET_ACCEL_MSG_MAX_PAGE_REQ ?
+ pages : NET_ACCEL_MSG_MAX_PAGE_REQ;
+
+ BUG_ON(offset < 0);
+ BUG_ON(pages <= 0);
+
+ rc = netfront_accel_buf_map_request(vnic->dev, bufpages,
+ &msg, pages, offset);
+ if (rc == 0) {
+ rc = net_accel_msg_send(vnic->shared_page,
+ &vnic->to_dom0, &msg);
+ if (rc < 0) {
+ VPRINTK("%s: queue full, stopping for now\n",
+ __FUNCTION__);
+ break;
+ }
+ sent++;
+ } else {
+ EPRINTK("%s: problem with grant, stopping for now\n",
+ __FUNCTION__);
+ break;
+ }
+
+ bufpages->page_reqs += pages;
+ }
+
+ if (sent)
+ net_accel_msg_notify(vnic->msg_channel_irq);
+
+ return rc;
+}
+
+
+/*
+ * In response to dom0 saying "my queue is full", we reply with this
+ * when it is no longer full
+ */
+inline void vnic_set_queue_not_full(netfront_accel_vnic *vnic)
+{
+
+ if (test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL_B,
+ (unsigned long *)&vnic->shared_page->aflags))
+ notify_remote_via_irq(vnic->msg_channel_irq);
+ else
+ VPRINTK("queue not full bit already set, not signalling\n");
+}
+
+/*
+ * Notify dom0 that the queue we want to use is full, it should
+ * respond by setting MSG_AFLAGS_QUEUEUNOTFULL in due course
+ */
+inline void vnic_set_queue_full(netfront_accel_vnic *vnic)
+{
+
+ if (!test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUFULL_B,
+ (unsigned long *)&vnic->shared_page->aflags))
+ notify_remote_via_irq(vnic->msg_channel_irq);
+ else
+ VPRINTK("queue full bit already set, not signalling\n");
+}
+
+
+static int vnic_check_hello_version(unsigned version)
+{
+ if (version > NET_ACCEL_MSG_VERSION) {
+ /* Newer protocol, we must refuse */
+ return -EPROTO;
+ }
+
+ if (version < NET_ACCEL_MSG_VERSION) {
+ /*
+ * We are newer, so have discretion to accept if we
+ * wish. For now however, just reject
+ */
+ return -EPROTO;
+ }
+
+ BUG_ON(version != NET_ACCEL_MSG_VERSION);
+ return 0;
+}
+
+
+static int vnic_process_hello_msg(netfront_accel_vnic *vnic,
+ struct net_accel_msg *msg)
+{
+ int err = 0;
+ unsigned pages = sfc_netfront_max_pages;
+
+ if (vnic_check_hello_version(msg->u.hello.version) < 0) {
+ msg->id = NET_ACCEL_MSG_HELLO | NET_ACCEL_MSG_REPLY
+ | NET_ACCEL_MSG_ERROR;
+ msg->u.hello.version = NET_ACCEL_MSG_VERSION;
+ } else {
+ vnic->backend_netdev_up
+ = vnic->shared_page->net_dev_up;
+
+ msg->id = NET_ACCEL_MSG_HELLO | NET_ACCEL_MSG_REPLY;
+ msg->u.hello.version = NET_ACCEL_MSG_VERSION;
+ if (msg->u.hello.max_pages &&
+ msg->u.hello.max_pages < pages)
+ pages = msg->u.hello.max_pages;
+ msg->u.hello.max_pages = pages;
+
+ /* Half of pages for rx, half for tx */
+ err = netfront_accel_alloc_buffer_mem(&vnic->bufpages,
+ vnic->rx_bufs,
+ vnic->tx_bufs,
+ pages);
+ if (err)
+ msg->id |= NET_ACCEL_MSG_ERROR;
+ }
+
+ /* Send reply */
+ net_accel_msg_reply_notify(vnic->shared_page, vnic->msg_channel_irq,
+ &vnic->to_dom0, msg);
+ return err;
+}
+
+
+static int vnic_process_localmac_msg(netfront_accel_vnic *vnic,
+ struct net_accel_msg *msg)
+{
+ unsigned long flags;
+ cuckoo_hash_mac_key key;
+
+ if (msg->u.localmac.flags & NET_ACCEL_MSG_ADD) {
+ DECLARE_MAC_BUF(buf);
+
+ DPRINTK("MAC has moved, could be local: %s\n",
+ print_mac(buf, msg->u.localmac.mac));
+ key = cuckoo_mac_to_key(msg->u.localmac.mac);
+ spin_lock_irqsave(&vnic->table_lock, flags);
+ /* Try to remove it, not a big deal if not there */
+ cuckoo_hash_remove(&vnic->fastpath_table,
+ (cuckoo_hash_key *)&key);
+ spin_unlock_irqrestore(&vnic->table_lock, flags);
+ }
+
+ return 0;
+}
+
+
+static
+int vnic_process_rx_msg(netfront_accel_vnic *vnic,
+ struct net_accel_msg *msg)
+{
+ int err;
+
+ switch (msg->id) {
+ case NET_ACCEL_MSG_HELLO:
+ /* Hello, reply with Reply */
+ DPRINTK("got Hello, with version %.8x\n",
+ msg->u.hello.version);
+ BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_NONE);
+ err = vnic_process_hello_msg(vnic, msg);
+ if (err == 0)
+ vnic->msg_state = NETFRONT_ACCEL_MSG_HELLO;
+ break;
+ case NET_ACCEL_MSG_SETHW:
+ /* Hardware info message */
+ DPRINTK("got H/W info\n");
+ BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HELLO);
+ err = netfront_accel_vi_init(vnic, &msg->u.hw);
+ if (err == 0)
+ vnic->msg_state = NETFRONT_ACCEL_MSG_HW;
+ break;
+ case NET_ACCEL_MSG_MAPBUF | NET_ACCEL_MSG_REPLY:
+ VPRINTK("Got mapped buffers back\n");
+ BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HW);
+ err = vnic_add_bufs(vnic, msg);
+ break;
+ case NET_ACCEL_MSG_MAPBUF | NET_ACCEL_MSG_REPLY | NET_ACCEL_MSG_ERROR:
+ /* No buffers. Can't use the fast path. */
+ EPRINTK("Got mapped buffers error. Cannot accelerate.\n");
+ BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HW);
+ err = -EIO;
+ break;
+ case NET_ACCEL_MSG_LOCALMAC:
+ /* Should be add, remove not currently used */
+ EPRINTK_ON(!(msg->u.localmac.flags & NET_ACCEL_MSG_ADD));
+ BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HW);
+ err = vnic_process_localmac_msg(vnic, msg);
+ break;
+ default:
+ EPRINTK("Huh? Message code is 0x%x\n", msg->id);
+ err = -EPROTO;
+ break;
+ }
+
+ return err;
+}
+
+
+/* Process an IRQ received from back end driver */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+void netfront_accel_msg_from_bend(struct work_struct *context)
+#else
+void netfront_accel_msg_from_bend(void *context)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+ netfront_accel_vnic *vnic =
+ container_of(context, netfront_accel_vnic, msg_from_bend);
+#else
+ netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
+#endif
+ struct net_accel_msg msg;
+ int err, queue_was_full = 0;
+
+ mutex_lock(&vnic->vnic_mutex);
+
+ /*
+ * This happens when the shared pages have been unmapped but
+ * the workqueue has yet to be flushed
+ */
+ if (!vnic->dom0_state_is_setup)
+ goto unlock_out;
+
+ while ((vnic->shared_page->aflags & NET_ACCEL_MSG_AFLAGS_TO_DOMU_MASK)
+ != 0) {
+ if (vnic->shared_page->aflags &
+ NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL) {
+ /* We've been told there may now be space. */
+ clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL_B,
+ (unsigned long *)&vnic->shared_page->aflags);
+ }
+
+ if (vnic->shared_page->aflags &
+ NET_ACCEL_MSG_AFLAGS_QUEUE0FULL) {
+ /*
+ * There will be space at the end of this
+ * function if we can make any.
+ */
+ clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0FULL_B,
+ (unsigned long *)&vnic->shared_page->aflags);
+ queue_was_full = 1;
+ }
+
+ if (vnic->shared_page->aflags &
+ NET_ACCEL_MSG_AFLAGS_NETUPDOWN) {
+ DPRINTK("%s: net interface change\n", __FUNCTION__);
+ clear_bit(NET_ACCEL_MSG_AFLAGS_NETUPDOWN_B,
+ (unsigned long *)&vnic->shared_page->aflags);
+ if (vnic->shared_page->net_dev_up)
+ netfront_accel_interface_up(vnic);
+ else
+ netfront_accel_interface_down(vnic);
+ }
+ }
+
+ /* Pull msg out of shared memory */
+ while ((err = net_accel_msg_recv(vnic->shared_page, &vnic->from_dom0,
+ &msg)) == 0) {
+ err = vnic_process_rx_msg(vnic, &msg);
+
+ if (err != 0)
+ goto done;
+ }
+
+ /*
+ * Send any pending buffer map request messages that we can,
+ * and mark domU->dom0 as full if necessary.
+ */
+ if (vnic->msg_state == NETFRONT_ACCEL_MSG_HW &&
+ vnic->bufpages.page_reqs < vnic->bufpages.max_pages) {
+ if (vnic_send_buffer_requests(vnic, &vnic->bufpages) == -ENOSPC)
+ vnic_set_queue_full(vnic);
+ }
+
+ /*
+ * If there are no messages then this is not an error. It
+ * just means that we've finished processing the queue.
+ */
+ if (err == -ENOENT)
+ err = 0;
+ done:
+ /* We will now have made space in the dom0->domU queue if we can */
+ if (queue_was_full)
+ vnic_set_queue_not_full(vnic);
+
+ if (err != 0) {
+ EPRINTK("%s returned %d\n", __FUNCTION__, err);
+ netfront_accel_set_closing(vnic);
+ }
+
+ unlock_out:
+ mutex_unlock(&vnic->vnic_mutex);
+
+ return;
+}
+
+
+irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context)
+{
+ netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
+ VPRINTK("irq %d from device %s\n", irq, vnic->dev->nodename);
+
+ queue_work(netfront_accel_workqueue, &vnic->msg_from_bend);
+
+ return IRQ_HANDLED;
+}
+
+/* Process an interrupt received from the NIC via backend */
+irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context)
+{
+ netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
+ struct net_device *net_dev = vnic->net_dev;
+ unsigned long flags;
+
+ VPRINTK("net irq %d from device %s\n", irq, vnic->dev->nodename);
+
+ NETFRONT_ACCEL_STATS_OP(vnic->stats.irq_count++);
+
+ BUG_ON(net_dev==NULL);
+
+ spin_lock_irqsave(&vnic->irq_enabled_lock, flags);
+ if (vnic->irq_enabled) {
+ struct netfront_info *np = netdev_priv(net_dev);
+
+ netfront_accel_disable_net_interrupts(vnic);
+ vnic->irq_enabled = 0;
+ spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
+
+#if NETFRONT_ACCEL_STATS
+ vnic->stats.poll_schedule_count++;
+ if (vnic->stats.event_count_since_irq >
+ vnic->stats.events_per_irq_max)
+ vnic->stats.events_per_irq_max =
+ vnic->stats.event_count_since_irq;
+ vnic->stats.event_count_since_irq = 0;
+#endif
+ netif_rx_schedule(net_dev, &np->napi);
+ }
+ else {
+ spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
+ NETFRONT_ACCEL_STATS_OP(vnic->stats.useless_irq_count++);
+ DPRINTK("%s: irq when disabled\n", __FUNCTION__);
+ }
+
+ return IRQ_HANDLED;
+}
+
+
+void netfront_accel_msg_tx_fastpath(netfront_accel_vnic *vnic, const void *mac,
+ u32 ip, u16 port, u8 protocol)
+{
+ unsigned long lock_state;
+ struct net_accel_msg *msg;
+
+ msg = net_accel_msg_start_send(vnic->shared_page, &vnic->to_dom0,
+ &lock_state);
+
+ if (msg == NULL)
+ return;
+
+ net_accel_msg_init(msg, NET_ACCEL_MSG_FASTPATH);
+ msg->u.fastpath.flags = NET_ACCEL_MSG_REMOVE;
+ memcpy(msg->u.fastpath.mac, mac, ETH_ALEN);
+
+ msg->u.fastpath.port = port;
+ msg->u.fastpath.ip = ip;
+ msg->u.fastpath.proto = protocol;
+
+ net_accel_msg_complete_send_notify(vnic->shared_page, &vnic->to_dom0,
+ &lock_state, vnic->msg_channel_irq);
+}
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+
+/* drivers/xen/netfront/netfront.h */
+#include "netfront.h"
+
+#include "accel.h"
+#include "accel_bufs.h"
+#include "accel_util.h"
+#include "accel_msg_iface.h"
+#include "accel_ssr.h"
+
+#ifdef EFX_GCOV
+#include "gcov.h"
+#endif
+
+#define NETFRONT_ACCEL_VNIC_FROM_NETDEV(_nd) \
+ ((netfront_accel_vnic *)((struct netfront_info *)netdev_priv(net_dev))->accel_priv)
+
+static int netfront_accel_netdev_start_xmit(struct sk_buff *skb,
+ struct net_device *net_dev)
+{
+ netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+ struct netfront_info *np =
+ (struct netfront_info *)netdev_priv(net_dev);
+ int handled, rc;
+ unsigned long flags1, flags2;
+
+ BUG_ON(vnic == NULL);
+
+ /* Take our tx lock and hold for the duration */
+ spin_lock_irqsave(&vnic->tx_lock, flags1);
+
+ if (!vnic->tx_enabled) {
+ rc = 0;
+ goto unlock_out;
+ }
+
+ handled = netfront_accel_vi_tx_post(vnic, skb);
+ if (handled == NETFRONT_ACCEL_STATUS_BUSY) {
+ BUG_ON(vnic->net_dev != net_dev);
+ DPRINTK("%s stopping queue\n", __FUNCTION__);
+
+ /* Netfront's lock protects tx_skb */
+ spin_lock_irqsave(&np->tx_lock, flags2);
+ BUG_ON(vnic->tx_skb != NULL);
+ vnic->tx_skb = skb;
+ netif_stop_queue(net_dev);
+ spin_unlock_irqrestore(&np->tx_lock, flags2);
+
+ NETFRONT_ACCEL_STATS_OP(vnic->stats.queue_stops++);
+ }
+
+ if (handled == NETFRONT_ACCEL_STATUS_CANT)
+ rc = 0;
+ else
+ rc = 1;
+
+unlock_out:
+ spin_unlock_irqrestore(&vnic->tx_lock, flags1);
+
+ return rc;
+}
+
+
+static int netfront_accel_netdev_poll(struct net_device *net_dev, int *budget)
+{
+ netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+ int rx_allowed = *budget, rx_done;
+
+ BUG_ON(vnic == NULL);
+
+ /* Can check this without lock as modifier excludes polls */
+ if (!vnic->poll_enabled)
+ return 0;
+
+ rx_done = netfront_accel_vi_poll(vnic, rx_allowed);
+ *budget -= rx_done;
+
+ NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_call_count++);
+
+ VPRINTK("%s: done %d allowed %d\n",
+ __FUNCTION__, rx_done, rx_allowed);
+
+ netfront_accel_ssr_end_of_burst(vnic, &vnic->ssr_state);
+
+ if (rx_done < rx_allowed) {
+ return 0; /* Done */
+ }
+
+ NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_reschedule_count++);
+
+ return 1; /* More to do. */
+}
+
+
+/*
+ * Process request from netfront to start napi interrupt
+ * mode. (i.e. enable interrupts as it's finished polling)
+ */
+static int netfront_accel_start_napi_interrupts(struct net_device *net_dev)
+{
+ netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+ unsigned long flags;
+
+ BUG_ON(vnic == NULL);
+
+ /*
+ * Can check this without lock as writer excludes poll before
+ * modifying
+ */
+ if (!vnic->poll_enabled)
+ return 0;
+
+ if (!netfront_accel_vi_enable_interrupts(vnic)) {
+ /*
+ * There was something there, tell caller we had
+ * something to do.
+ */
+ return 1;
+ }
+
+ spin_lock_irqsave(&vnic->irq_enabled_lock, flags);
+ vnic->irq_enabled = 1;
+ netfront_accel_enable_net_interrupts(vnic);
+ spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
+
+ return 0;
+}
+
+
+/*
+ * Process request from netfront to stop napi interrupt
+ * mode. (i.e. disable interrupts as it's starting to poll
+ */
+static void netfront_accel_stop_napi_interrupts(struct net_device *net_dev)
+{
+ netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+ unsigned long flags;
+
+ BUG_ON(vnic == NULL);
+
+ spin_lock_irqsave(&vnic->irq_enabled_lock, flags);
+
+ if (!vnic->poll_enabled) {
+ spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
+ return;
+ }
+
+ netfront_accel_disable_net_interrupts(vnic);
+ vnic->irq_enabled = 0;
+ spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
+}
+
+
+static int netfront_accel_check_ready(struct net_device *net_dev)
+{
+ netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+
+ BUG_ON(vnic == NULL);
+
+ /* This is protected by netfront's lock */
+ return vnic->tx_skb == NULL;
+}
+
+
+static int netfront_accel_get_stats(struct net_device *net_dev,
+ struct net_device_stats *stats)
+{
+ netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+ struct netfront_accel_netdev_stats now;
+
+ BUG_ON(vnic == NULL);
+
+ now.fastpath_rx_pkts = vnic->netdev_stats.fastpath_rx_pkts;
+ now.fastpath_rx_bytes = vnic->netdev_stats.fastpath_rx_bytes;
+ now.fastpath_rx_errors = vnic->netdev_stats.fastpath_rx_errors;
+ now.fastpath_tx_pkts = vnic->netdev_stats.fastpath_tx_pkts;
+ now.fastpath_tx_bytes = vnic->netdev_stats.fastpath_tx_bytes;
+ now.fastpath_tx_errors = vnic->netdev_stats.fastpath_tx_errors;
+
+ stats->rx_packets += (now.fastpath_rx_pkts -
+ vnic->stats_last_read.fastpath_rx_pkts);
+ stats->rx_bytes += (now.fastpath_rx_bytes -
+ vnic->stats_last_read.fastpath_rx_bytes);
+ stats->rx_errors += (now.fastpath_rx_errors -
+ vnic->stats_last_read.fastpath_rx_errors);
+ stats->tx_packets += (now.fastpath_tx_pkts -
+ vnic->stats_last_read.fastpath_tx_pkts);
+ stats->tx_bytes += (now.fastpath_tx_bytes -
+ vnic->stats_last_read.fastpath_tx_bytes);
+ stats->tx_errors += (now.fastpath_tx_errors -
+ vnic->stats_last_read.fastpath_tx_errors);
+
+ vnic->stats_last_read = now;
+
+ return 0;
+}
+
+
+struct netfront_accel_hooks accel_hooks = {
+ .new_device = &netfront_accel_probe,
+ .remove = &netfront_accel_remove,
+ .netdev_poll = &netfront_accel_netdev_poll,
+ .start_xmit = &netfront_accel_netdev_start_xmit,
+ .start_napi_irq = &netfront_accel_start_napi_interrupts,
+ .stop_napi_irq = &netfront_accel_stop_napi_interrupts,
+ .check_ready = &netfront_accel_check_ready,
+ .get_stats = &netfront_accel_get_stats
+};
+
+
+unsigned sfc_netfront_max_pages = NETFRONT_ACCEL_DEFAULT_BUF_PAGES;
+module_param_named (max_pages, sfc_netfront_max_pages, uint, 0644);
+MODULE_PARM_DESC(max_pages, "Number of buffer pages to request");
+
+unsigned sfc_netfront_buffer_split = 2;
+module_param_named (buffer_split, sfc_netfront_buffer_split, uint, 0644);
+MODULE_PARM_DESC(buffer_split,
+ "Fraction of buffers to use for TX, rest for RX");
+
+
+const char *frontend_name = "sfc_netfront";
+
+struct workqueue_struct *netfront_accel_workqueue;
+
+static int __init netfront_accel_init(void)
+{
+ int rc;
+#ifdef EFX_GCOV
+ gcov_provider_init(THIS_MODULE);
+#endif
+
+ /*
+ * If we're running on dom0, netfront hasn't initialised
+ * itself, so we need to keep away
+ */
+ if (is_initial_xendomain())
+ return 0;
+
+ if (!is_pow2(sizeof(struct net_accel_msg)))
+ EPRINTK("%s: bad structure size\n", __FUNCTION__);
+
+ netfront_accel_workqueue = create_workqueue(frontend_name);
+
+ netfront_accel_debugfs_init();
+
+ rc = netfront_accelerator_loaded(NETFRONT_ACCEL_VERSION,
+ frontend_name, &accel_hooks);
+
+ if (rc < 0) {
+ EPRINTK("Xen netfront accelerator version mismatch\n");
+ return -EINVAL;
+ }
+
+ if (rc > 0) {
+ /*
+ * In future may want to add backwards compatibility
+ * and accept certain subsets of previous versions
+ */
+ EPRINTK("Xen netfront accelerator version mismatch\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+module_init(netfront_accel_init);
+
+static void __exit netfront_accel_exit(void)
+{
+ if (is_initial_xendomain())
+ return;
+
+ DPRINTK("%s: unhooking\n", __FUNCTION__);
+
+ /* Unhook from normal netfront */
+ netfront_accelerator_stop(frontend_name);
+
+ DPRINTK("%s: done\n", __FUNCTION__);
+
+ netfront_accel_debugfs_fini();
+
+ flush_workqueue(netfront_accel_workqueue);
+
+ destroy_workqueue(netfront_accel_workqueue);
+
+#ifdef EFX_GCOV
+ gcov_provider_fini(THIS_MODULE);
+#endif
+ return;
+}
+module_exit(netfront_accel_exit);
+
+MODULE_LICENSE("GPL");
+
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/list.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+
+#include "accel.h"
+#include "accel_util.h"
+#include "accel_bufs.h"
+
+#include "accel_ssr.h"
+
+static inline int list_valid(struct list_head *lh) {
+ return(lh->next != NULL);
+}
+
+static void netfront_accel_ssr_deliver (struct netfront_accel_vnic *vnic,
+ struct netfront_accel_ssr_state *st,
+ struct netfront_accel_ssr_conn *c);
+
+/** Construct an efx_ssr_state.
+ *
+ * @v st The SSR state (per channel per port)
+ * @v port The port.
+ */
+void netfront_accel_ssr_init(struct netfront_accel_ssr_state *st) {
+ unsigned i;
+
+ INIT_LIST_HEAD(&st->conns);
+ INIT_LIST_HEAD(&st->free_conns);
+ for (i = 0; i < 8; ++i) {
+ struct netfront_accel_ssr_conn *c =
+ kmalloc(sizeof(*c), GFP_KERNEL);
+ if (c == NULL) break;
+ c->n_in_order_pkts = 0;
+ c->skb = NULL;
+ list_add(&c->link, &st->free_conns);
+ }
+
+}
+
+
+/** Destructor for an efx_ssr_state.
+ *
+ * @v st The SSR state (per channel per port)
+ */
+void netfront_accel_ssr_fini(netfront_accel_vnic *vnic,
+ struct netfront_accel_ssr_state *st) {
+ struct netfront_accel_ssr_conn *c;
+
+ /* Return cleanly if efx_ssr_init() not previously called */
+ BUG_ON(list_valid(&st->conns) != list_valid(&st->free_conns));
+ if (! list_valid(&st->conns))
+ return;
+
+ while ( ! list_empty(&st->free_conns)) {
+ c = list_entry(st->free_conns.prev,
+ struct netfront_accel_ssr_conn, link);
+ list_del(&c->link);
+ BUG_ON(c->skb != NULL);
+ kfree(c);
+ }
+ while ( ! list_empty(&st->conns)) {
+ c = list_entry(st->conns.prev,
+ struct netfront_accel_ssr_conn, link);
+ list_del(&c->link);
+ if (c->skb)
+ netfront_accel_ssr_deliver(vnic, st, c);
+ kfree(c);
+ }
+}
+
+
+/** Calc IP checksum and deliver to the OS
+ *
+ * @v st The SSR state (per channel per port)
+ * @v c The SSR connection state
+ */
+static void netfront_accel_ssr_deliver(netfront_accel_vnic *vnic,
+ struct netfront_accel_ssr_state *st,
+ struct netfront_accel_ssr_conn *c) {
+ BUG_ON(c->skb == NULL);
+
+ /*
+ * If we've chained packets together, recalculate the IP
+ * checksum.
+ */
+ if (skb_shinfo(c->skb)->frag_list) {
+ NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_bursts);
+ c->iph->check = 0;
+ c->iph->check = ip_fast_csum((unsigned char *) c->iph,
+ c->iph->ihl);
+ }
+
+ VPRINTK("%s: %d\n", __FUNCTION__, c->skb->len);
+
+ netif_receive_skb(c->skb);
+ c->skb = NULL;
+}
+
+
+/** Push held skbs down into network stack.
+ *
+ * @v st SSR state
+ *
+ * Only called if we are tracking one or more connections.
+ */
+void __netfront_accel_ssr_end_of_burst(netfront_accel_vnic *vnic,
+ struct netfront_accel_ssr_state *st) {
+ struct netfront_accel_ssr_conn *c;
+
+ BUG_ON(list_empty(&st->conns));
+
+ list_for_each_entry(c, &st->conns, link)
+ if (c->skb)
+ netfront_accel_ssr_deliver(vnic, st, c);
+
+ /* Time-out connections that have received no traffic for 20ms. */
+ c = list_entry(st->conns.prev, struct netfront_accel_ssr_conn,
+ link);
+ if (jiffies - c->last_pkt_jiffies > (HZ / 50 + 1)) {
+ NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_drop_stream);
+ list_del(&c->link);
+ list_add(&c->link, &st->free_conns);
+ }
+}
+
+
+/** Process SKB and decide whether to dispatch it to the stack now or
+ * later.
+ *
+ * @v st SSR state
+ * @v skb SKB to exmaine
+ * @ret rc 0 => deliver SKB to kernel now, otherwise the SKB belongs
+ * us.
+ */
+int netfront_accel_ssr_skb(struct netfront_accel_vnic *vnic,
+ struct netfront_accel_ssr_state *st,
+ struct sk_buff *skb) {
+ int data_length, dont_merge;
+ struct netfront_accel_ssr_conn *c;
+ struct iphdr *iph;
+ struct tcphdr *th;
+ unsigned th_seq;
+
+ BUG_ON(skb_shinfo(skb)->frag_list != NULL);
+ BUG_ON(skb->next != NULL);
+
+ /* We're not interested if it isn't TCP over IPv4. */
+ iph = (struct iphdr *) skb->data;
+ if (skb->protocol != htons(ETH_P_IP) ||
+ iph->protocol != IPPROTO_TCP) {
+ return 0;
+ }
+
+ /* Ignore segments that fail csum or are fragmented. */
+ if (unlikely((skb->ip_summed - CHECKSUM_UNNECESSARY) |
+ (iph->frag_off & htons(IP_MF | IP_OFFSET)))) {
+ return 0;
+ }
+
+ th = (struct tcphdr*)(skb->data + iph->ihl * 4);
+ data_length = ntohs(iph->tot_len) - iph->ihl * 4 - th->doff * 4;
+ th_seq = ntohl(th->seq);
+ dont_merge = (data_length == 0) | th->urg | th->syn | th->rst;
+
+ list_for_each_entry(c, &st->conns, link) {
+ if ((c->saddr - iph->saddr) |
+ (c->daddr - iph->daddr) |
+ (c->source - th->source) |
+ (c->dest - th->dest ))
+ continue;
+
+ /* Re-insert at head of list to reduce lookup time. */
+ list_del(&c->link);
+ list_add(&c->link, &st->conns);
+ c->last_pkt_jiffies = jiffies;
+
+ if (unlikely(th_seq - c->next_seq)) {
+ /* Out-of-order, so start counting again. */
+ if (c->skb)
+ netfront_accel_ssr_deliver(vnic, st, c);
+ c->n_in_order_pkts = 0;
+ c->next_seq = th_seq + data_length;
+ NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_misorder);
+ return 0;
+ }
+ c->next_seq = th_seq + data_length;
+
+ if (++c->n_in_order_pkts < 300) {
+ /* May be in slow-start, so don't merge. */
+ NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_slow_start);
+ return 0;
+ }
+
+ if (unlikely(dont_merge)) {
+ if (c->skb)
+ netfront_accel_ssr_deliver(vnic, st, c);
+ return 0;
+ }
+
+ if (c->skb) {
+ c->iph->tot_len = ntohs(c->iph->tot_len);
+ c->iph->tot_len += data_length;
+ c->iph->tot_len = htons(c->iph->tot_len);
+ c->th->ack_seq = th->ack_seq;
+ c->th->fin |= th->fin;
+ c->th->psh |= th->psh;
+ c->th->window = th->window;
+
+ /* Remove the headers from this skb. */
+ skb_pull(skb, skb->len - data_length);
+
+ /*
+ * Tack the new skb onto the head skb's frag_list.
+ * This is exactly the format that fragmented IP
+ * datagrams are reassembled into.
+ */
+ BUG_ON(skb->next != 0);
+ if ( ! skb_shinfo(c->skb)->frag_list)
+ skb_shinfo(c->skb)->frag_list = skb;
+ else
+ c->skb_tail->next = skb;
+ c->skb_tail = skb;
+ c->skb->len += skb->len;
+ c->skb->data_len += skb->len;
+ c->skb->truesize += skb->truesize;
+
+ NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_merges);
+
+ /*
+ * If the next packet might push this super-packet
+ * over the limit for an IP packet, deliver it now.
+ * This is slightly conservative, but close enough.
+ */
+ if (c->skb->len +
+ (PAGE_SIZE / NETFRONT_ACCEL_BUFS_PER_PAGE)
+ > 16384)
+ netfront_accel_ssr_deliver(vnic, st, c);
+
+ return 1;
+ }
+ else {
+ c->iph = iph;
+ c->th = th;
+ c->skb = skb;
+ return 1;
+ }
+ }
+
+ /* We're not yet tracking this connection. */
+
+ if (dont_merge) {
+ return 0;
+ }
+
+ if (list_empty(&st->free_conns)) {
+ c = list_entry(st->conns.prev,
+ struct netfront_accel_ssr_conn,
+ link);
+ if (c->skb) {
+ NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_too_many);
+ return 0;
+ }
+ }
+ else {
+ c = list_entry(st->free_conns.next,
+ struct netfront_accel_ssr_conn,
+ link);
+ }
+ list_del(&c->link);
+ list_add(&c->link, &st->conns);
+ c->saddr = iph->saddr;
+ c->daddr = iph->daddr;
+ c->source = th->source;
+ c->dest = th->dest;
+ c->next_seq = th_seq + data_length;
+ c->n_in_order_pkts = 0;
+ BUG_ON(c->skb != NULL);
+ NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_new_stream);
+ return 0;
+}
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef NETFRONT_ACCEL_SSR_H
+#define NETFRONT_ACCEL_SSR_H
+
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/list.h>
+
+#include "accel.h"
+
+/** State for Soft Segment Reassembly (SSR). */
+
+struct netfront_accel_ssr_conn {
+ struct list_head link;
+
+ unsigned saddr, daddr;
+ unsigned short source, dest;
+
+ /** Number of in-order packets we've seen with payload. */
+ unsigned n_in_order_pkts;
+
+ /** Next in-order sequence number. */
+ unsigned next_seq;
+
+ /** Time we last saw a packet on this connection. */
+ unsigned long last_pkt_jiffies;
+
+ /** The SKB we are currently holding. If NULL, then all following
+ * fields are undefined.
+ */
+ struct sk_buff *skb;
+
+ /** The tail of the frag_list of SKBs we're holding. Only valid
+ * after at least one merge.
+ */
+ struct sk_buff *skb_tail;
+
+ /** The IP header of the skb we are holding. */
+ struct iphdr *iph;
+
+ /** The TCP header of the skb we are holding. */
+ struct tcphdr *th;
+};
+
+extern void netfront_accel_ssr_init(struct netfront_accel_ssr_state *st);
+extern void netfront_accel_ssr_fini(netfront_accel_vnic *vnic,
+ struct netfront_accel_ssr_state *st);
+
+extern void
+__netfront_accel_ssr_end_of_burst(netfront_accel_vnic *vnic,
+ struct netfront_accel_ssr_state *st);
+
+extern int netfront_accel_ssr_skb(netfront_accel_vnic *vnic,
+ struct netfront_accel_ssr_state *st,
+ struct sk_buff *skb);
+
+static inline void
+netfront_accel_ssr_end_of_burst (netfront_accel_vnic *vnic,
+ struct netfront_accel_ssr_state *st) {
+ if ( ! list_empty(&st->conns) )
+ __netfront_accel_ssr_end_of_burst(vnic, st);
+}
+
+#endif /* NETFRONT_ACCEL_SSR_H */
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <linux/pci.h>
+#include <linux/tcp.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+
+#include "accel.h"
+#include "accel_util.h"
+
+#include "accel_tso.h"
+
+#define ETH_HDR_LEN(skb) skb_network_offset(skb)
+#define SKB_TCP_OFF(skb) skb_transport_offset(skb)
+#define SKB_IP_OFF(skb) skb_network_offset(skb)
+
+/*
+ * Set a maximum number of buffers in each output packet to make life
+ * a little simpler - if this is reached it will just move on to
+ * another packet
+ */
+#define ACCEL_TSO_MAX_BUFFERS (6)
+
+/** TSO State.
+ *
+ * The state used during segmentation. It is put into this data structure
+ * just to make it easy to pass into inline functions.
+ */
+struct netfront_accel_tso_state {
+ /** bytes of data we've yet to segment */
+ unsigned remaining_len;
+
+ /** current sequence number */
+ unsigned seqnum;
+
+ /** remaining space in current packet */
+ unsigned packet_space;
+
+ /** List of packets to be output, containing the buffers and
+ * iovecs to describe each packet
+ */
+ struct netfront_accel_tso_output_packet *output_packets;
+
+ /** Total number of buffers in output_packets */
+ unsigned buffers;
+
+ /** Total number of packets in output_packets */
+ unsigned packets;
+
+ /** Input Fragment Cursor.
+ *
+ * Where we are in the current fragment of the incoming SKB. These
+ * values get updated in place when we split a fragment over
+ * multiple packets.
+ */
+ struct {
+ /** address of current position */
+ void *addr;
+ /** remaining length */
+ unsigned int len;
+ } ifc; /* == ifc Input Fragment Cursor */
+
+ /** Parameters.
+ *
+ * These values are set once at the start of the TSO send and do
+ * not get changed as the routine progresses.
+ */
+ struct {
+ /* the number of bytes of header */
+ unsigned int header_length;
+
+ /* The number of bytes to put in each outgoing segment. */
+ int full_packet_size;
+
+ /* Current IP ID, host endian. */
+ unsigned ip_id;
+
+ /* Max size of each output packet payload */
+ int gso_size;
+ } p;
+};
+
+
+/**
+ * Verify that our various assumptions about sk_buffs and the conditions
+ * under which TSO will be attempted hold true.
+ *
+ * @v skb The sk_buff to check.
+ */
+static inline void tso_check_safe(struct sk_buff *skb) {
+ EPRINTK_ON(skb->protocol != htons (ETH_P_IP));
+ EPRINTK_ON(((struct ethhdr*) skb->data)->h_proto != htons (ETH_P_IP));
+ EPRINTK_ON(ip_hdr(skb)->protocol != IPPROTO_TCP);
+ EPRINTK_ON((SKB_TCP_OFF(skb) + tcp_hdrlen(skb)) > skb_headlen(skb));
+}
+
+
+
+/** Parse the SKB header and initialise state. */
+static inline void tso_start(struct netfront_accel_tso_state *st,
+ struct sk_buff *skb) {
+
+ /*
+ * All ethernet/IP/TCP headers combined size is TCP header size
+ * plus offset of TCP header relative to start of packet.
+ */
+ st->p.header_length = tcp_hdrlen(skb) + SKB_TCP_OFF(skb);
+ st->p.full_packet_size = (st->p.header_length
+ + skb_shinfo(skb)->gso_size);
+ st->p.gso_size = skb_shinfo(skb)->gso_size;
+
+ st->p.ip_id = htons(ip_hdr(skb)->id);
+ st->seqnum = ntohl(tcp_hdr(skb)->seq);
+
+ EPRINTK_ON(tcp_hdr(skb)->urg);
+ EPRINTK_ON(tcp_hdr(skb)->syn);
+ EPRINTK_ON(tcp_hdr(skb)->rst);
+
+ st->remaining_len = skb->len - st->p.header_length;
+
+ st->output_packets = NULL;
+ st->buffers = 0;
+ st->packets = 0;
+
+ VPRINTK("Starting new TSO: hl %d ps %d gso %d seq %x len %d\n",
+ st->p.header_length, st->p.full_packet_size, st->p.gso_size,
+ st->seqnum, skb->len);
+}
+
+/**
+ * Add another NIC mapped buffer onto an output packet
+ */
+static inline int tso_start_new_buffer(netfront_accel_vnic *vnic,
+ struct netfront_accel_tso_state *st,
+ int first)
+{
+ struct netfront_accel_tso_buffer *tso_buf;
+ struct netfront_accel_pkt_desc *buf;
+
+ /* Get a mapped packet buffer */
+ buf = netfront_accel_buf_get(vnic->tx_bufs);
+ if (buf == NULL) {
+ DPRINTK("%s: No buffer for TX\n", __FUNCTION__);
+ return -1;
+ }
+
+ /* Store a bit of meta-data at the end */
+ tso_buf =(struct netfront_accel_tso_buffer *)
+ (buf->pkt_kva + NETFRONT_ACCEL_TSO_BUF_LENGTH
+ + sizeof(struct netfront_accel_tso_output_packet));
+
+ tso_buf->buf = buf;
+
+ tso_buf->length = 0;
+
+ if (first) {
+ struct netfront_accel_tso_output_packet *output_packet
+ = (struct netfront_accel_tso_output_packet *)
+ (buf->pkt_kva + NETFRONT_ACCEL_TSO_BUF_LENGTH);
+ output_packet->next = st->output_packets;
+ st->output_packets = output_packet;
+ tso_buf->next = NULL;
+ st->output_packets->tso_bufs = tso_buf;
+ st->output_packets->tso_bufs_len = 1;
+ } else {
+ tso_buf->next = st->output_packets->tso_bufs;
+ st->output_packets->tso_bufs = tso_buf;
+ st->output_packets->tso_bufs_len ++;
+ }
+
+ BUG_ON(st->output_packets->tso_bufs_len > ACCEL_TSO_MAX_BUFFERS);
+
+ st->buffers ++;
+
+ /*
+ * Store the context, set to NULL, last packet buffer will get
+ * non-NULL later
+ */
+ tso_buf->buf->skb = NULL;
+
+ return 0;
+}
+
+
+/* Generate a new header, and prepare for the new packet.
+ *
+ * @v vnic VNIC
+ * @v skb Socket buffer
+ * @v st TSO state
+ * @ret rc 0 on success, or -1 if failed to alloc header
+ */
+
+static inline
+int tso_start_new_packet(netfront_accel_vnic *vnic,
+ struct sk_buff *skb,
+ struct netfront_accel_tso_state *st)
+{
+ struct netfront_accel_tso_buffer *tso_buf;
+ struct iphdr *tsoh_iph;
+ struct tcphdr *tsoh_th;
+ unsigned ip_length;
+
+ if (tso_start_new_buffer(vnic, st, 1) < 0) {
+ NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++);
+ return -1;
+ }
+
+ /* This has been set up by tso_start_new_buffer() */
+ tso_buf = st->output_packets->tso_bufs;
+
+ /* Copy in the header */
+ memcpy(tso_buf->buf->pkt_kva, skb->data, st->p.header_length);
+ tso_buf->length = st->p.header_length;
+
+ tsoh_th = (struct tcphdr*)
+ (tso_buf->buf->pkt_kva + SKB_TCP_OFF(skb));
+ tsoh_iph = (struct iphdr*)
+ (tso_buf->buf->pkt_kva + SKB_IP_OFF(skb));
+
+ /* Set to zero to encourage falcon to fill these in */
+ tsoh_th->check = 0;
+ tsoh_iph->check = 0;
+
+ tsoh_th->seq = htonl(st->seqnum);
+ st->seqnum += st->p.gso_size;
+
+ if (st->remaining_len > st->p.gso_size) {
+ /* This packet will not finish the TSO burst. */
+ ip_length = st->p.full_packet_size - ETH_HDR_LEN(skb);
+ tsoh_th->fin = 0;
+ tsoh_th->psh = 0;
+ } else {
+ /* This packet will be the last in the TSO burst. */
+ ip_length = (st->p.header_length - ETH_HDR_LEN(skb)
+ + st->remaining_len);
+ tsoh_th->fin = tcp_hdr(skb)->fin;
+ tsoh_th->psh = tcp_hdr(skb)->psh;
+ }
+
+ tsoh_iph->tot_len = htons(ip_length);
+
+ /* Linux leaves suitable gaps in the IP ID space for us to fill. */
+ tsoh_iph->id = st->p.ip_id++;
+ tsoh_iph->id = htons(tsoh_iph->id);
+
+ st->packet_space = st->p.gso_size;
+
+ st->packets++;
+
+ return 0;
+}
+
+
+
+static inline void tso_get_fragment(struct netfront_accel_tso_state *st,
+ int len, void *addr)
+{
+ st->ifc.len = len;
+ st->ifc.addr = addr;
+ return;
+}
+
+
+static inline void tso_unwind(netfront_accel_vnic *vnic,
+ struct netfront_accel_tso_state *st)
+{
+ struct netfront_accel_tso_buffer *tso_buf;
+ struct netfront_accel_tso_output_packet *output_packet;
+
+ DPRINTK("%s\n", __FUNCTION__);
+
+ while (st->output_packets != NULL) {
+ output_packet = st->output_packets;
+ st->output_packets = output_packet->next;
+ while (output_packet->tso_bufs != NULL) {
+ tso_buf = output_packet->tso_bufs;
+ output_packet->tso_bufs = tso_buf->next;
+
+ st->buffers --;
+ output_packet->tso_bufs_len --;
+
+ netfront_accel_buf_put(vnic->tx_bufs,
+ tso_buf->buf->buf_id);
+ }
+ }
+ BUG_ON(st->buffers != 0);
+}
+
+
+
+static inline
+void tso_fill_packet_with_fragment(netfront_accel_vnic *vnic,
+ struct netfront_accel_tso_state *st)
+{
+ struct netfront_accel_tso_buffer *tso_buf;
+ int n, space;
+
+ BUG_ON(st->output_packets == NULL);
+ BUG_ON(st->output_packets->tso_bufs == NULL);
+
+ tso_buf = st->output_packets->tso_bufs;
+
+ if (st->ifc.len == 0) return;
+ if (st->packet_space == 0) return;
+ if (tso_buf->length == NETFRONT_ACCEL_TSO_BUF_LENGTH) return;
+
+ n = min(st->ifc.len, st->packet_space);
+
+ space = NETFRONT_ACCEL_TSO_BUF_LENGTH - tso_buf->length;
+ n = min(n, space);
+
+ st->packet_space -= n;
+ st->remaining_len -= n;
+ st->ifc.len -= n;
+
+ memcpy(tso_buf->buf->pkt_kva + tso_buf->length, st->ifc.addr, n);
+
+ tso_buf->length += n;
+
+ BUG_ON(tso_buf->length > NETFRONT_ACCEL_TSO_BUF_LENGTH);
+
+ st->ifc.addr += n;
+
+ return;
+}
+
+
+int netfront_accel_enqueue_skb_tso(netfront_accel_vnic *vnic,
+ struct sk_buff *skb)
+{
+ struct netfront_accel_tso_state state;
+ struct netfront_accel_tso_buffer *tso_buf = NULL;
+ struct netfront_accel_tso_output_packet *reversed_list = NULL;
+ struct netfront_accel_tso_output_packet *tmp_pkt;
+ ef_iovec iovecs[ACCEL_TSO_MAX_BUFFERS];
+ int frag_i, rc, dma_id;
+ skb_frag_t *f;
+
+ tso_check_safe(skb);
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ EPRINTK("Trying to TSO send a packet without HW checksum\n");
+
+ tso_start(&state, skb);
+
+ /*
+ * Setup the first payload fragment. If the skb header area
+ * contains exactly the headers and all payload is in the frag
+ * list things are little simpler
+ */
+ if (skb_headlen(skb) == state.p.header_length) {
+ /* Grab the first payload fragment. */
+ BUG_ON(skb_shinfo(skb)->nr_frags < 1);
+ frag_i = 0;
+ f = &skb_shinfo(skb)->frags[frag_i];
+ tso_get_fragment(&state, f->size,
+ page_address(f->page) + f->page_offset);
+ } else {
+ int hl = state.p.header_length;
+ tso_get_fragment(&state, skb_headlen(skb) - hl,
+ skb->data + hl);
+ frag_i = -1;
+ }
+
+ if (tso_start_new_packet(vnic, skb, &state) < 0) {
+ DPRINTK("%s: out of first start-packet memory\n",
+ __FUNCTION__);
+ goto unwind;
+ }
+
+ while (1) {
+ tso_fill_packet_with_fragment(vnic, &state);
+
+ /* Move onto the next fragment? */
+ if (state.ifc.len == 0) {
+ if (++frag_i >= skb_shinfo(skb)->nr_frags)
+ /* End of payload reached. */
+ break;
+ f = &skb_shinfo(skb)->frags[frag_i];
+ tso_get_fragment(&state, f->size,
+ page_address(f->page) +
+ f->page_offset);
+ }
+
+ /* Start a new buffer? */
+ if ((state.output_packets->tso_bufs->length ==
+ NETFRONT_ACCEL_TSO_BUF_LENGTH) &&
+ tso_start_new_buffer(vnic, &state, 0)) {
+ DPRINTK("%s: out of start-buffer memory\n",
+ __FUNCTION__);
+ goto unwind;
+ }
+
+ /* Start at new packet? */
+ if ((state.packet_space == 0 ||
+ ((state.output_packets->tso_bufs_len >=
+ ACCEL_TSO_MAX_BUFFERS) &&
+ (state.output_packets->tso_bufs->length >=
+ NETFRONT_ACCEL_TSO_BUF_LENGTH))) &&
+ tso_start_new_packet(vnic, skb, &state) < 0) {
+ DPRINTK("%s: out of start-packet memory\n",
+ __FUNCTION__);
+ goto unwind;
+ }
+
+ }
+
+ /* Check for space */
+ if (ef_vi_transmit_space(&vnic->vi) < state.buffers) {
+ DPRINTK("%s: Not enough TX space (%d)\n",
+ __FUNCTION__, state.buffers);
+ goto unwind;
+ }
+
+ /*
+ * Store the skb context in the most recent buffer (i.e. the
+ * last buffer that will be sent)
+ */
+ state.output_packets->tso_bufs->buf->skb = skb;
+
+ /* Reverse the list of packets as we construct it on a stack */
+ while (state.output_packets != NULL) {
+ tmp_pkt = state.output_packets;
+ state.output_packets = tmp_pkt->next;
+ tmp_pkt->next = reversed_list;
+ reversed_list = tmp_pkt;
+ }
+
+ /* Pass off to hardware */
+ while (reversed_list != NULL) {
+ tmp_pkt = reversed_list;
+ reversed_list = tmp_pkt->next;
+
+ BUG_ON(tmp_pkt->tso_bufs_len > ACCEL_TSO_MAX_BUFFERS);
+ BUG_ON(tmp_pkt->tso_bufs_len == 0);
+
+ dma_id = tmp_pkt->tso_bufs->buf->buf_id;
+
+ /*
+ * Make an iovec of the buffers in the list, reversing
+ * the buffers as we go as they are constructed on a
+ * stack
+ */
+ tso_buf = tmp_pkt->tso_bufs;
+ for (frag_i = tmp_pkt->tso_bufs_len - 1;
+ frag_i >= 0;
+ frag_i--) {
+ iovecs[frag_i].iov_base = tso_buf->buf->pkt_buff_addr;
+ iovecs[frag_i].iov_len = tso_buf->length;
+ tso_buf = tso_buf->next;
+ }
+
+ rc = ef_vi_transmitv(&vnic->vi, iovecs, tmp_pkt->tso_bufs_len,
+ dma_id);
+ /*
+ * We checked for space already, so it really should
+ * succeed
+ */
+ BUG_ON(rc != 0);
+ }
+
+ /* Track number of tx fastpath stats */
+ vnic->netdev_stats.fastpath_tx_bytes += skb->len;
+ vnic->netdev_stats.fastpath_tx_pkts += state.packets;
+#if NETFRONT_ACCEL_STATS
+ {
+ unsigned n;
+ n = vnic->netdev_stats.fastpath_tx_pkts -
+ vnic->stats.fastpath_tx_completions;
+ if (n > vnic->stats.fastpath_tx_pending_max)
+ vnic->stats.fastpath_tx_pending_max = n;
+ }
+#endif
+
+ return NETFRONT_ACCEL_STATUS_GOOD;
+
+ unwind:
+ tso_unwind(vnic, &state);
+
+ NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++);
+
+ return NETFRONT_ACCEL_STATUS_BUSY;
+}
+
+
+
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef NETFRONT_ACCEL_TSO_H
+#define NETFRONT_ACCEL_TSO_H
+
+#include "accel_bufs.h"
+
+/* Track the buffers used in each output packet */
+struct netfront_accel_tso_buffer {
+ struct netfront_accel_tso_buffer *next;
+ struct netfront_accel_pkt_desc *buf;
+ unsigned length;
+};
+
+/* Track the output packets formed from each input packet */
+struct netfront_accel_tso_output_packet {
+ struct netfront_accel_tso_output_packet *next;
+ struct netfront_accel_tso_buffer *tso_bufs;
+ unsigned tso_bufs_len;
+};
+
+
+/*
+ * Max available space in a buffer for data once meta-data has taken
+ * its place
+ */
+#define NETFRONT_ACCEL_TSO_BUF_LENGTH \
+ ((PAGE_SIZE / NETFRONT_ACCEL_BUFS_PER_PAGE) \
+ - sizeof(struct netfront_accel_tso_buffer) \
+ - sizeof(struct netfront_accel_tso_output_packet))
+
+int netfront_accel_enqueue_skb_tso(netfront_accel_vnic *vnic,
+ struct sk_buff *skb);
+
+#endif /* NETFRONT_ACCEL_TSO_H */
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <asm/io.h>
+
+#include "accel.h"
+#include "accel_util.h"
+#include "accel_bufs.h"
+#include "accel_tso.h"
+#include "accel_ssr.h"
+#include "netfront.h"
+
+#include "etherfabric/ef_vi.h"
+
+/*
+ * Max available space in a buffer for data once meta-data has taken
+ * its place
+ */
+#define NETFRONT_ACCEL_TX_BUF_LENGTH \
+ ((PAGE_SIZE / NETFRONT_ACCEL_BUFS_PER_PAGE) \
+ - sizeof(struct netfront_accel_tso_buffer))
+
+#define ACCEL_TX_MAX_BUFFERS (6)
+#define ACCEL_VI_POLL_EVENTS (8)
+
+static
+int netfront_accel_vi_init_fini(netfront_accel_vnic *vnic,
+ struct net_accel_msg_hw *hw_msg)
+{
+ struct ef_vi_nic_type nic_type;
+ struct net_accel_hw_falcon_b *hw_info;
+ void *io_kva, *evq_base, *rx_dma_kva, *tx_dma_kva, *doorbell_kva;
+ u32 *evq_gnts;
+ u32 evq_order;
+ int vi_state_size;
+ u8 vi_data[VI_MAPPINGS_SIZE];
+
+ if (hw_msg == NULL)
+ goto fini;
+
+ /* And create the local macs table lock */
+ spin_lock_init(&vnic->table_lock);
+
+ /* Create fastpath table, initial size 8, key length 8 */
+ if (cuckoo_hash_init(&vnic->fastpath_table, 3, 8)) {
+ EPRINTK("failed to allocate fastpath table\n");
+ goto fail_cuckoo;
+ }
+
+ vnic->hw.falcon.type = hw_msg->type;
+
+ switch (hw_msg->type) {
+ case NET_ACCEL_MSG_HWTYPE_FALCON_A:
+ hw_info = &hw_msg->resources.falcon_a.common;
+ /* Need the extra rptr register page on A1 */
+ io_kva = net_accel_map_iomem_page
+ (vnic->dev, hw_msg->resources.falcon_a.evq_rptr_gnt,
+ &vnic->hw.falcon.evq_rptr_mapping);
+ if (io_kva == NULL) {
+ EPRINTK("%s: evq_rptr permission failed\n", __FUNCTION__);
+ goto evq_rptr_fail;
+ }
+
+ vnic->hw.falcon.evq_rptr = io_kva +
+ (hw_info->evq_rptr & (PAGE_SIZE - 1));
+ break;
+ case NET_ACCEL_MSG_HWTYPE_FALCON_B:
+ hw_info = &hw_msg->resources.falcon_b;
+ break;
+ default:
+ goto bad_type;
+ }
+
+ /**** Event Queue ****/
+
+ /* Map the event queue pages */
+ evq_gnts = hw_info->evq_mem_gnts;
+ evq_order = hw_info->evq_order;
+
+ EPRINTK_ON(hw_info->evq_offs != 0);
+
+ DPRINTK("Will map evq %d pages\n", 1 << evq_order);
+
+ evq_base =
+ net_accel_map_grants_contig(vnic->dev, evq_gnts, 1 << evq_order,
+ &vnic->evq_mapping);
+ if (evq_base == NULL) {
+ EPRINTK("%s: evq_base failed\n", __FUNCTION__);
+ goto evq_fail;
+ }
+
+ /**** Doorbells ****/
+ /* Set up the doorbell mappings. */
+ doorbell_kva =
+ net_accel_map_iomem_page(vnic->dev, hw_info->doorbell_gnt,
+ &vnic->hw.falcon.doorbell_mapping);
+ if (doorbell_kva == NULL) {
+ EPRINTK("%s: doorbell permission failed\n", __FUNCTION__);
+ goto doorbell_fail;
+ }
+ vnic->hw.falcon.doorbell = doorbell_kva;
+
+ /* On Falcon_B we get the rptr from the doorbell page */
+ if (hw_msg->type == NET_ACCEL_MSG_HWTYPE_FALCON_B) {
+ vnic->hw.falcon.evq_rptr =
+ (u32 *)((char *)vnic->hw.falcon.doorbell
+ + hw_info->evq_rptr);
+ }
+
+ /**** DMA Queue ****/
+
+ /* Set up the DMA Queues from the message. */
+ tx_dma_kva = net_accel_map_grants_contig
+ (vnic->dev, &(hw_info->txdmaq_gnt), 1,
+ &vnic->hw.falcon.txdmaq_mapping);
+ if (tx_dma_kva == NULL) {
+ EPRINTK("%s: TX dma failed\n", __FUNCTION__);
+ goto tx_dma_fail;
+ }
+
+ rx_dma_kva = net_accel_map_grants_contig
+ (vnic->dev, &(hw_info->rxdmaq_gnt), 1,
+ &vnic->hw.falcon.rxdmaq_mapping);
+ if (rx_dma_kva == NULL) {
+ EPRINTK("%s: RX dma failed\n", __FUNCTION__);
+ goto rx_dma_fail;
+ }
+
+ /* Full confession */
+ DPRINTK("Mapped H/W"
+ " Tx DMAQ grant %x -> %p\n"
+ " Rx DMAQ grant %x -> %p\n"
+ " EVQ grant %x -> %p\n",
+ hw_info->txdmaq_gnt, tx_dma_kva,
+ hw_info->rxdmaq_gnt, rx_dma_kva,
+ evq_gnts[0], evq_base
+ );
+
+ memset(vi_data, 0, sizeof(vi_data));
+
+ /* TODO BUG11305: convert efhw_arch to ef_vi_arch
+ * e.g.
+ * arch = ef_vi_arch_from_efhw_arch(hw_info->nic_arch);
+ * assert(arch >= 0);
+ * nic_type.arch = arch;
+ */
+ nic_type.arch = (unsigned char)hw_info->nic_arch;
+ nic_type.variant = (char)hw_info->nic_variant;
+ nic_type.revision = (unsigned char)hw_info->nic_revision;
+
+ ef_vi_init_mapping_evq(vi_data, nic_type, hw_info->instance,
+ 1 << (evq_order + PAGE_SHIFT), evq_base,
+ (void *)0xdeadbeef);
+
+ ef_vi_init_mapping_vi(vi_data, nic_type, hw_info->rx_capacity,
+ hw_info->tx_capacity, hw_info->instance,
+ doorbell_kva, rx_dma_kva, tx_dma_kva, 0);
+
+ vi_state_size = ef_vi_calc_state_bytes(hw_info->rx_capacity,
+ hw_info->tx_capacity);
+ vnic->vi_state = (ef_vi_state *)kmalloc(vi_state_size, GFP_KERNEL);
+ if (vnic->vi_state == NULL) {
+ EPRINTK("%s: kmalloc for VI state failed\n", __FUNCTION__);
+ goto vi_state_fail;
+ }
+ ef_vi_init(&vnic->vi, vi_data, vnic->vi_state, &vnic->evq_state, 0);
+
+ ef_eventq_state_init(&vnic->vi);
+
+ ef_vi_state_init(&vnic->vi);
+
+ return 0;
+
+fini:
+ kfree(vnic->vi_state);
+ vnic->vi_state = NULL;
+vi_state_fail:
+ net_accel_unmap_grants_contig(vnic->dev, vnic->hw.falcon.rxdmaq_mapping);
+rx_dma_fail:
+ net_accel_unmap_grants_contig(vnic->dev, vnic->hw.falcon.txdmaq_mapping);
+tx_dma_fail:
+ net_accel_unmap_iomem_page(vnic->dev, vnic->hw.falcon.doorbell_mapping);
+ vnic->hw.falcon.doorbell = NULL;
+doorbell_fail:
+ net_accel_unmap_grants_contig(vnic->dev, vnic->evq_mapping);
+evq_fail:
+ if (vnic->hw.falcon.type == NET_ACCEL_MSG_HWTYPE_FALCON_A)
+ net_accel_unmap_iomem_page(vnic->dev,
+ vnic->hw.falcon.evq_rptr_mapping);
+ vnic->hw.falcon.evq_rptr = NULL;
+evq_rptr_fail:
+bad_type:
+ cuckoo_hash_destroy(&vnic->fastpath_table);
+fail_cuckoo:
+ return -EIO;
+}
+
+
+void netfront_accel_vi_ctor(netfront_accel_vnic *vnic)
+{
+ /* Just mark the VI as uninitialised. */
+ vnic->vi_state = NULL;
+}
+
+
+int netfront_accel_vi_init(netfront_accel_vnic *vnic, struct net_accel_msg_hw *hw_msg)
+{
+ BUG_ON(hw_msg == NULL);
+ return netfront_accel_vi_init_fini(vnic, hw_msg);
+}
+
+
+void netfront_accel_vi_dtor(netfront_accel_vnic *vnic)
+{
+ if (vnic->vi_state != NULL)
+ netfront_accel_vi_init_fini(vnic, NULL);
+}
+
+
+static
+void netfront_accel_vi_post_rx(netfront_accel_vnic *vnic, u16 id,
+ netfront_accel_pkt_desc *buf)
+{
+
+ int idx = vnic->rx_dma_batched;
+
+#if 0
+ VPRINTK("Posting buffer %d (0x%08x) for rx at index %d, space is %d\n",
+ id, buf->pkt_buff_addr, idx, ef_vi_receive_space(&vnic->vi));
+#endif
+ /* Set up a virtual buffer descriptor */
+ ef_vi_receive_init(&vnic->vi, buf->pkt_buff_addr, id,
+ /*rx_bytes=max*/0);
+
+ idx++;
+
+ vnic->rx_dma_level++;
+
+ /*
+ * Only push the descriptor to the card if we've reached the
+ * batch size. Otherwise, the descriptors can sit around for
+ * a while. There will be plenty available.
+ */
+ if (idx >= NETFRONT_ACCEL_RX_DESC_BATCH ||
+ vnic->rx_dma_level < NETFRONT_ACCEL_RX_DESC_BATCH) {
+#if 0
+ VPRINTK("Flushing %d rx descriptors.\n", idx);
+#endif
+
+ /* Push buffer to hardware */
+ ef_vi_receive_push(&vnic->vi);
+
+ idx = 0;
+ }
+
+ vnic->rx_dma_batched = idx;
+}
+
+
+inline
+void netfront_accel_vi_post_rx_or_free(netfront_accel_vnic *vnic, u16 id,
+ netfront_accel_pkt_desc *buf)
+{
+
+ VPRINTK("%s: %d\n", __FUNCTION__, id);
+
+ if (ef_vi_receive_space(&vnic->vi) <= vnic->rx_dma_batched) {
+ VPRINTK("RX space is full\n");
+ netfront_accel_buf_put(vnic->rx_bufs, id);
+ return;
+ }
+
+ VPRINTK("Completed buffer %d is reposted\n", id);
+ netfront_accel_vi_post_rx(vnic, id, buf);
+
+ /*
+ * Let's see if there's any more to be pushed out to the NIC
+ * while we're here
+ */
+ while (ef_vi_receive_space(&vnic->vi) > vnic->rx_dma_batched) {
+ /* Try to allocate a buffer. */
+ buf = netfront_accel_buf_get(vnic->rx_bufs);
+ if (buf == NULL)
+ break;
+
+ /* Add it to the rx dma queue. */
+ netfront_accel_vi_post_rx(vnic, buf->buf_id, buf);
+ }
+}
+
+
+void netfront_accel_vi_add_bufs(netfront_accel_vnic *vnic, int is_rx)
+{
+
+ while (is_rx &&
+ ef_vi_receive_space(&vnic->vi) > vnic->rx_dma_batched) {
+ netfront_accel_pkt_desc *buf;
+
+ VPRINTK("%s: %d\n", __FUNCTION__, vnic->rx_dma_level);
+
+ /* Try to allocate a buffer. */
+ buf = netfront_accel_buf_get(vnic->rx_bufs);
+
+ if (buf == NULL)
+ break;
+
+ /* Add it to the rx dma queue. */
+ netfront_accel_vi_post_rx(vnic, buf->buf_id, buf);
+ }
+
+ VPRINTK("%s: done\n", __FUNCTION__);
+}
+
+
+struct netfront_accel_multi_state {
+ unsigned remaining_len;
+
+ unsigned buffers;
+
+ struct netfront_accel_tso_buffer *output_buffers;
+
+ /* Where we are in the current fragment of the SKB. */
+ struct {
+ /* address of current position */
+ void *addr;
+ /* remaining length */
+ unsigned int len;
+ } ifc; /* == Input Fragment Cursor */
+};
+
+
+static inline void multi_post_start(struct netfront_accel_multi_state *st,
+ struct sk_buff *skb)
+{
+ st->remaining_len = skb->len;
+ st->output_buffers = NULL;
+ st->buffers = 0;
+ st->ifc.len = skb_headlen(skb);
+ st->ifc.addr = skb->data;
+}
+
+static int multi_post_start_new_buffer(netfront_accel_vnic *vnic,
+ struct netfront_accel_multi_state *st)
+{
+ struct netfront_accel_tso_buffer *tso_buf;
+ struct netfront_accel_pkt_desc *buf;
+
+ /* Get a mapped packet buffer */
+ buf = netfront_accel_buf_get(vnic->tx_bufs);
+ if (buf == NULL) {
+ DPRINTK("%s: No buffer for TX\n", __FUNCTION__);
+ return -1;
+ }
+
+ /* Store a bit of meta-data at the end */
+ tso_buf = (struct netfront_accel_tso_buffer *)
+ (buf->pkt_kva + NETFRONT_ACCEL_TX_BUF_LENGTH);
+
+ tso_buf->buf = buf;
+
+ tso_buf->length = 0;
+
+ tso_buf->next = st->output_buffers;
+ st->output_buffers = tso_buf;
+ st->buffers++;
+
+ BUG_ON(st->buffers >= ACCEL_TX_MAX_BUFFERS);
+
+ /*
+ * Store the context, set to NULL, last packet buffer will get
+ * non-NULL later
+ */
+ tso_buf->buf->skb = NULL;
+
+ return 0;
+}
+
+
+static void
+multi_post_fill_buffer_with_fragment(netfront_accel_vnic *vnic,
+ struct netfront_accel_multi_state *st)
+{
+ struct netfront_accel_tso_buffer *tso_buf;
+ unsigned n, space;
+
+ BUG_ON(st->output_buffers == NULL);
+ tso_buf = st->output_buffers;
+
+ if (st->ifc.len == 0) return;
+ if (tso_buf->length == NETFRONT_ACCEL_TX_BUF_LENGTH) return;
+
+ BUG_ON(tso_buf->length > NETFRONT_ACCEL_TX_BUF_LENGTH);
+
+ space = NETFRONT_ACCEL_TX_BUF_LENGTH - tso_buf->length;
+ n = min(st->ifc.len, space);
+
+ memcpy(tso_buf->buf->pkt_kva + tso_buf->length, st->ifc.addr, n);
+
+ st->remaining_len -= n;
+ st->ifc.len -= n;
+ tso_buf->length += n;
+ st->ifc.addr += n;
+
+ BUG_ON(tso_buf->length > NETFRONT_ACCEL_TX_BUF_LENGTH);
+
+ return;
+}
+
+
+static inline void multi_post_unwind(netfront_accel_vnic *vnic,
+ struct netfront_accel_multi_state *st)
+{
+ struct netfront_accel_tso_buffer *tso_buf;
+
+ DPRINTK("%s\n", __FUNCTION__);
+
+ while (st->output_buffers != NULL) {
+ tso_buf = st->output_buffers;
+ st->output_buffers = tso_buf->next;
+ st->buffers--;
+ netfront_accel_buf_put(vnic->tx_bufs, tso_buf->buf->buf_id);
+ }
+ BUG_ON(st->buffers != 0);
+}
+
+
+static enum netfront_accel_post_status
+netfront_accel_enqueue_skb_multi(netfront_accel_vnic *vnic, struct sk_buff *skb)
+{
+ struct netfront_accel_tso_buffer *tso_buf;
+ struct netfront_accel_multi_state state;
+ ef_iovec iovecs[ACCEL_TX_MAX_BUFFERS];
+ skb_frag_t *f;
+ int frag_i, rc, dma_id;
+
+ multi_post_start(&state, skb);
+
+ frag_i = -1;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ /* Set to zero to encourage falcon to work it out for us */
+ *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0;
+ }
+
+ if (multi_post_start_new_buffer(vnic, &state)) {
+ DPRINTK("%s: out of buffers\n", __FUNCTION__);
+ goto unwind;
+ }
+
+ while (1) {
+ multi_post_fill_buffer_with_fragment(vnic, &state);
+
+ /* Move onto the next fragment? */
+ if (state.ifc.len == 0) {
+ if (++frag_i >= skb_shinfo(skb)->nr_frags)
+ /* End of payload reached. */
+ break;
+ f = &skb_shinfo(skb)->frags[frag_i];
+ state.ifc.len = f->size;
+ state.ifc.addr = page_address(f->page) + f->page_offset;
+ }
+
+ /* Start a new buffer? */
+ if ((state.output_buffers->length ==
+ NETFRONT_ACCEL_TX_BUF_LENGTH) &&
+ multi_post_start_new_buffer(vnic, &state)) {
+ DPRINTK("%s: out of buffers\n", __FUNCTION__);
+ goto unwind;
+ }
+ }
+
+ /* Check for space */
+ if (ef_vi_transmit_space(&vnic->vi) < state.buffers) {
+ DPRINTK("%s: Not enough TX space (%d)\n", __FUNCTION__, state.buffers);
+ goto unwind;
+ }
+
+ /* Store the skb in what will be the last buffer's context */
+ state.output_buffers->buf->skb = skb;
+ /* Remember dma_id of what will be the last buffer */
+ dma_id = state.output_buffers->buf->buf_id;
+
+ /*
+ * Make an iovec of the buffers in the list, reversing the
+ * buffers as we go as they are constructed on a stack
+ */
+ tso_buf = state.output_buffers;
+ for (frag_i = state.buffers-1; frag_i >= 0; frag_i--) {
+ iovecs[frag_i].iov_base = tso_buf->buf->pkt_buff_addr;
+ iovecs[frag_i].iov_len = tso_buf->length;
+ tso_buf = tso_buf->next;
+ }
+
+ rc = ef_vi_transmitv(&vnic->vi, iovecs, state.buffers, dma_id);
+
+ /* Track number of tx fastpath stats */
+ vnic->netdev_stats.fastpath_tx_bytes += skb->len;
+ vnic->netdev_stats.fastpath_tx_pkts ++;
+#if NETFRONT_ACCEL_STATS
+ {
+ u32 n;
+ n = vnic->netdev_stats.fastpath_tx_pkts -
+ (u32)vnic->stats.fastpath_tx_completions;
+ if (n > vnic->stats.fastpath_tx_pending_max)
+ vnic->stats.fastpath_tx_pending_max = n;
+ }
+#endif
+ return NETFRONT_ACCEL_STATUS_GOOD;
+
+unwind:
+ multi_post_unwind(vnic, &state);
+
+ NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++);
+
+ return NETFRONT_ACCEL_STATUS_BUSY;
+}
+
+
+static enum netfront_accel_post_status
+netfront_accel_enqueue_skb_single(netfront_accel_vnic *vnic, struct sk_buff *skb)
+{
+ struct netfront_accel_tso_buffer *tso_buf;
+ struct netfront_accel_pkt_desc *buf;
+ u8 *kva;
+ int rc;
+
+ if (ef_vi_transmit_space(&vnic->vi) < 1) {
+ DPRINTK("%s: No TX space\n", __FUNCTION__);
+ NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++);
+ return NETFRONT_ACCEL_STATUS_BUSY;
+ }
+
+ buf = netfront_accel_buf_get(vnic->tx_bufs);
+ if (buf == NULL) {
+ DPRINTK("%s: No buffer for TX\n", __FUNCTION__);
+ NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++);
+ return NETFRONT_ACCEL_STATUS_BUSY;
+ }
+
+ /* Track number of tx fastpath stats */
+ vnic->netdev_stats.fastpath_tx_pkts++;
+ vnic->netdev_stats.fastpath_tx_bytes += skb->len;
+
+#if NETFRONT_ACCEL_STATS
+ {
+ u32 n;
+ n = vnic->netdev_stats.fastpath_tx_pkts -
+ (u32)vnic->stats.fastpath_tx_completions;
+ if (n > vnic->stats.fastpath_tx_pending_max)
+ vnic->stats.fastpath_tx_pending_max = n;
+ }
+#endif
+
+ /* Store the context */
+ buf->skb = skb;
+
+ kva = buf->pkt_kva;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ /* Set to zero to encourage falcon to work it out for us */
+ *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0;
+ }
+ NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT
+ (skb, idx, frag_data, frag_len, {
+ /* Copy in payload */
+ VPRINTK("*** Copying %d bytes to %p\n", frag_len, kva);
+ memcpy(kva, frag_data, frag_len);
+ kva += frag_len;
+ });
+
+ VPRINTK("%s: id %d pkt %p kva %p buff_addr 0x%08x\n", __FUNCTION__,
+ buf->buf_id, buf, buf->pkt_kva, buf->pkt_buff_addr);
+
+
+ /* Set up the TSO meta-data for a single buffer/packet */
+ tso_buf = (struct netfront_accel_tso_buffer *)
+ (buf->pkt_kva + NETFRONT_ACCEL_TX_BUF_LENGTH);
+ tso_buf->next = NULL;
+ tso_buf->buf = buf;
+ tso_buf->length = skb->len;
+
+ rc = ef_vi_transmit(&vnic->vi, buf->pkt_buff_addr, skb->len,
+ buf->buf_id);
+ /* We checked for space already, so it really should succeed */
+ BUG_ON(rc != 0);
+
+ return NETFRONT_ACCEL_STATUS_GOOD;
+}
+
+
+enum netfront_accel_post_status
+netfront_accel_vi_tx_post(netfront_accel_vnic *vnic, struct sk_buff *skb)
+{
+ struct ethhdr *pkt_eth_hdr;
+ struct iphdr *pkt_ipv4_hdr;
+ int value, try_fastpath;
+
+ /*
+ * This assumes that the data field points to the dest mac
+ * address.
+ */
+ cuckoo_hash_mac_key key = cuckoo_mac_to_key(skb->data);
+
+ /*
+ * NB very important that all things that could return "CANT"
+ * are tested before things that return "BUSY" as if it it
+ * returns "BUSY" it is assumed that it won't return "CANT"
+ * next time it is tried
+ */
+
+ /*
+ * Do a fastpath send if fast path table lookup returns true.
+ * We do this without the table lock and so may get the wrong
+ * answer, but current opinion is that's not a big problem
+ */
+ try_fastpath = cuckoo_hash_lookup(&vnic->fastpath_table,
+ (cuckoo_hash_key *)(&key), &value);
+
+ if (!try_fastpath) {
+ DECLARE_MAC_BUF(buf);
+
+ VPRINTK("try fast path false for mac: %s\n",
+ print_mac(buf, skb->data));
+
+ return NETFRONT_ACCEL_STATUS_CANT;
+ }
+
+ /* Check to see if the packet can be sent. */
+ if (skb_headlen(skb) < sizeof(*pkt_eth_hdr) + sizeof(*pkt_ipv4_hdr)) {
+ EPRINTK("%s: Packet header is too small\n", __FUNCTION__);
+ return NETFRONT_ACCEL_STATUS_CANT;
+ }
+
+ pkt_eth_hdr = (void*)skb->data;
+ pkt_ipv4_hdr = (void*)(pkt_eth_hdr+1);
+
+ if (be16_to_cpu(pkt_eth_hdr->h_proto) != ETH_P_IP) {
+ DPRINTK("%s: Packet is not IPV4 (ether_type=0x%04x)\n", __FUNCTION__,
+ be16_to_cpu(pkt_eth_hdr->h_proto));
+ return NETFRONT_ACCEL_STATUS_CANT;
+ }
+
+ if (pkt_ipv4_hdr->protocol != IPPROTO_TCP &&
+ pkt_ipv4_hdr->protocol != IPPROTO_UDP) {
+ DPRINTK("%s: Packet is not TCP/UDP (ip_protocol=0x%02x)\n",
+ __FUNCTION__, pkt_ipv4_hdr->protocol);
+ return NETFRONT_ACCEL_STATUS_CANT;
+ }
+
+ VPRINTK("%s: %d bytes, gso %d\n", __FUNCTION__, skb->len,
+ skb_shinfo(skb)->gso_size);
+
+ if (skb_shinfo(skb)->gso_size) {
+ return netfront_accel_enqueue_skb_tso(vnic, skb);
+ }
+
+ if (skb->len <= NETFRONT_ACCEL_TX_BUF_LENGTH) {
+ return netfront_accel_enqueue_skb_single(vnic, skb);
+ }
+
+ return netfront_accel_enqueue_skb_multi(vnic, skb);
+}
+
+
+/*
+ * Copy the data to required end destination. NB. len is the total new
+ * length of the socket buffer, not the amount of data to copy
+ */
+inline
+int ef_vnic_copy_to_skb(netfront_accel_vnic *vnic, struct sk_buff *skb,
+ struct netfront_accel_pkt_desc *buf, int len)
+{
+ int i, extra = len - skb->len;
+ char c;
+ int pkt_stride = vnic->rx_pkt_stride;
+ int skb_stride = vnic->rx_skb_stride;
+ char *skb_start;
+
+ /*
+ * This pulls stuff into the cache - have seen performance
+ * benefit in this, but disabled by default
+ */
+ skb_start = skb->data;
+ if (pkt_stride) {
+ for (i = 0; i < len; i += pkt_stride) {
+ c += ((volatile char*)(buf->pkt_kva))[i];
+ }
+ }
+ if (skb_stride) {
+ for (i = skb->len; i < len ; i += skb_stride) {
+ c += ((volatile char*)(skb_start))[i];
+ }
+ }
+
+ if (skb_tailroom(skb) >= extra) {
+ memcpy(skb_put(skb, extra), buf->pkt_kva, extra);
+ return 0;
+ }
+
+ return -ENOSPC;
+}
+
+
+static void discard_jumbo_state(netfront_accel_vnic *vnic)
+{
+
+ if (vnic->jumbo_state.skb != NULL) {
+ dev_kfree_skb_any(vnic->jumbo_state.skb);
+
+ vnic->jumbo_state.skb = NULL;
+ }
+ vnic->jumbo_state.in_progress = 0;
+}
+
+
+static void netfront_accel_vi_rx_complete(netfront_accel_vnic *vnic,
+ struct sk_buff *skb)
+{
+ cuckoo_hash_mac_key key;
+ unsigned long flags;
+ int value;
+ struct net_device *net_dev;
+
+
+ key = cuckoo_mac_to_key(skb->data + ETH_ALEN);
+
+ /*
+ * If this is a MAC address that we want to do fast path TX
+ * to, and we don't already, add it to the fastpath table.
+ * The initial lookup is done without the table lock and so
+ * may get the wrong answer, but current opinion is that's not
+ * a big problem
+ */
+ if (is_valid_ether_addr(skb->data + ETH_ALEN) &&
+ !cuckoo_hash_lookup(&vnic->fastpath_table, (cuckoo_hash_key *)&key,
+ &value)) {
+ spin_lock_irqsave(&vnic->table_lock, flags);
+
+ cuckoo_hash_add_check(&vnic->fastpath_table,
+ (cuckoo_hash_key *)&key,
+ 1, 1);
+
+ spin_unlock_irqrestore(&vnic->table_lock, flags);
+ }
+
+ if (compare_ether_addr(skb->data, vnic->mac)) {
+ struct iphdr *ip = (struct iphdr *)(skb->data + ETH_HLEN);
+ u16 port;
+ DECLARE_MAC_BUF(buf);
+
+ DPRINTK("%s: saw wrong MAC address %s\n",
+ __FUNCTION__, print_mac(buf, skb->data));
+
+ if (ip->protocol == IPPROTO_TCP) {
+ struct tcphdr *tcp = (struct tcphdr *)
+ ((char *)ip + 4 * ip->ihl);
+ port = tcp->dest;
+ } else {
+ struct udphdr *udp = (struct udphdr *)
+ ((char *)ip + 4 * ip->ihl);
+ EPRINTK_ON(ip->protocol != IPPROTO_UDP);
+ port = udp->dest;
+ }
+
+ netfront_accel_msg_tx_fastpath(vnic, skb->data,
+ ip->daddr, port,
+ ip->protocol);
+ }
+
+ net_dev = vnic->net_dev;
+ skb->dev = net_dev;
+ skb->protocol = eth_type_trans(skb, net_dev);
+ /* CHECKSUM_UNNECESSARY as hardware has done it already */
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ if (!netfront_accel_ssr_skb(vnic, &vnic->ssr_state, skb))
+ netif_receive_skb(skb);
+}
+
+
+static int netfront_accel_vi_poll_process_rx(netfront_accel_vnic *vnic,
+ ef_event *ev)
+{
+ struct netfront_accel_bufinfo *bufinfo = vnic->rx_bufs;
+ struct netfront_accel_pkt_desc *buf = NULL;
+ struct sk_buff *skb;
+ int id, len, sop = 0, cont = 0;
+
+ VPRINTK("Rx event.\n");
+ /*
+ * Complete the receive operation, and get the request id of
+ * the buffer
+ */
+ id = ef_vi_receive_done(&vnic->vi, ev);
+
+ if (id < 0 || id >= bufinfo->npages*NETFRONT_ACCEL_BUFS_PER_PAGE) {
+ EPRINTK("Rx packet %d is invalid\n", id);
+ /* Carry on round the loop if more events */
+ goto bad_packet;
+ }
+ /* Get our buffer descriptor */
+ buf = netfront_accel_buf_find(bufinfo, id);
+
+ len = EF_EVENT_RX_BYTES(*ev);
+
+ /* An RX buffer has been removed from the DMA ring. */
+ vnic->rx_dma_level--;
+
+ if (EF_EVENT_TYPE(*ev) == EF_EVENT_TYPE_RX) {
+ sop = EF_EVENT_RX_SOP(*ev);
+ cont = EF_EVENT_RX_CONT(*ev);
+
+ skb = vnic->jumbo_state.skb;
+
+ VPRINTK("Rx packet %d: %d bytes so far; sop %d; cont %d\n",
+ id, len, sop, cont);
+
+ if (sop) {
+ if (!vnic->jumbo_state.in_progress) {
+ vnic->jumbo_state.in_progress = 1;
+ BUG_ON(vnic->jumbo_state.skb != NULL);
+ } else {
+ /*
+ * This fragment shows a missing tail in
+ * previous one, but is itself possibly OK
+ */
+ DPRINTK("sop and in_progress => no tail\n");
+
+ /* Release the socket buffer we already had */
+ discard_jumbo_state(vnic);
+
+ /* Now start processing this fragment */
+ vnic->jumbo_state.in_progress = 1;
+ skb = NULL;
+ }
+ } else if (!vnic->jumbo_state.in_progress) {
+ DPRINTK("!sop and !in_progress => missing head\n");
+ goto missing_head;
+ }
+
+ if (!cont) {
+ /* Update state for next time */
+ vnic->jumbo_state.in_progress = 0;
+ vnic->jumbo_state.skb = NULL;
+ } else if (!vnic->jumbo_state.in_progress) {
+ DPRINTK("cont and !in_progress => missing head\n");
+ goto missing_head;
+ }
+
+ if (skb == NULL) {
+ BUG_ON(!sop);
+
+ if (!cont)
+ skb = alloc_skb(len+NET_IP_ALIGN, GFP_ATOMIC);
+ else
+ skb = alloc_skb(vnic->net_dev->mtu+NET_IP_ALIGN,
+ GFP_ATOMIC);
+
+ if (skb == NULL) {
+ DPRINTK("%s: Couldn't get an rx skb.\n",
+ __FUNCTION__);
+ netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf);
+ /*
+ * Dropping this fragment means we
+ * should discard the rest too
+ */
+ discard_jumbo_state(vnic);
+
+ /* Carry on round the loop if more events */
+ return 0;
+ }
+
+ }
+
+ /* Copy the data to required end destination */
+ if (ef_vnic_copy_to_skb(vnic, skb, buf, len) != 0) {
+ /*
+ * No space in the skb - suggests > MTU packet
+ * received
+ */
+ EPRINTK("%s: Rx packet too large (%d)\n",
+ __FUNCTION__, len);
+ netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf);
+ discard_jumbo_state(vnic);
+ return 0;
+ }
+
+ /* Put the buffer back in the DMA queue. */
+ netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf);
+
+ if (cont) {
+ vnic->jumbo_state.skb = skb;
+
+ return 0;
+ } else {
+ /* Track number of rx fastpath packets */
+ vnic->netdev_stats.fastpath_rx_pkts++;
+ vnic->netdev_stats.fastpath_rx_bytes += len;
+
+ netfront_accel_vi_rx_complete(vnic, skb);
+
+ return 1;
+ }
+ } else {
+ BUG_ON(EF_EVENT_TYPE(*ev) != EF_EVENT_TYPE_RX_DISCARD);
+
+ if (EF_EVENT_RX_DISCARD_TYPE(*ev)
+ == EF_EVENT_RX_DISCARD_TRUNC) {
+ DPRINTK("%s: " EF_EVENT_FMT
+ " buffer %d FRM_TRUNC q_id %d\n",
+ __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id,
+ EF_EVENT_RX_DISCARD_Q_ID(*ev) );
+ NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_frm_trunc);
+ } else if (EF_EVENT_RX_DISCARD_TYPE(*ev)
+ == EF_EVENT_RX_DISCARD_OTHER) {
+ DPRINTK("%s: " EF_EVENT_FMT
+ " buffer %d RX_DISCARD_OTHER q_id %d\n",
+ __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id,
+ EF_EVENT_RX_DISCARD_Q_ID(*ev) );
+ /*
+ * Probably tail of packet for which error has
+ * already been logged, so don't count in
+ * stats
+ */
+ } else {
+ EPRINTK("%s: " EF_EVENT_FMT
+ " buffer %d rx discard type %d q_id %d\n",
+ __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id,
+ EF_EVENT_RX_DISCARD_TYPE(*ev),
+ EF_EVENT_RX_DISCARD_Q_ID(*ev) );
+ NETFRONT_ACCEL_STATS_OP(++vnic->stats.bad_event_count);
+ }
+ }
+
+ /* discard type drops through here */
+
+bad_packet:
+ /* Release the socket buffer we already had */
+ discard_jumbo_state(vnic);
+
+missing_head:
+ BUG_ON(vnic->jumbo_state.in_progress != 0);
+ BUG_ON(vnic->jumbo_state.skb != NULL);
+
+ if (id >= 0 && id < bufinfo->npages*NETFRONT_ACCEL_BUFS_PER_PAGE)
+ /* Put the buffer back in the DMA queue. */
+ netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf);
+
+ vnic->netdev_stats.fastpath_rx_errors++;
+
+ DPRINTK("%s experienced bad packet/missing fragment error: %d \n",
+ __FUNCTION__, ev->rx.flags);
+
+ return 0;
+}
+
+
+static void netfront_accel_vi_not_busy(netfront_accel_vnic *vnic)
+{
+ struct netfront_info *np = ((struct netfront_info *)
+ netdev_priv(vnic->net_dev));
+ struct sk_buff *skb;
+ int handled;
+ unsigned long flags;
+
+ /*
+ * TODO if we could safely check tx_skb == NULL and return
+ * early without taking the lock, that would obviously help
+ * performance
+ */
+
+ /* Take the netfront lock which protects tx_skb. */
+ spin_lock_irqsave(&np->tx_lock, flags);
+ if (vnic->tx_skb != NULL) {
+ DPRINTK("%s trying to send spare buffer\n", __FUNCTION__);
+
+ skb = vnic->tx_skb;
+ vnic->tx_skb = NULL;
+
+ spin_unlock_irqrestore(&np->tx_lock, flags);
+
+ handled = netfront_accel_vi_tx_post(vnic, skb);
+
+ spin_lock_irqsave(&np->tx_lock, flags);
+
+ if (handled != NETFRONT_ACCEL_STATUS_BUSY) {
+ DPRINTK("%s restarting tx\n", __FUNCTION__);
+ if (netfront_check_queue_ready(vnic->net_dev)) {
+ netif_wake_queue(vnic->net_dev);
+ NETFRONT_ACCEL_STATS_OP
+ (vnic->stats.queue_wakes++);
+ }
+ } else {
+ vnic->tx_skb = skb;
+ }
+
+ /*
+ * Should never get a CANT, as it checks that before
+ * deciding it was BUSY first time round
+ */
+ BUG_ON(handled == NETFRONT_ACCEL_STATUS_CANT);
+ }
+ spin_unlock_irqrestore(&np->tx_lock, flags);
+}
+
+
+static void netfront_accel_vi_tx_complete(netfront_accel_vnic *vnic,
+ struct netfront_accel_tso_buffer *tso_buf,
+ int is_last)
+{
+ struct netfront_accel_tso_buffer *next;
+
+ /*
+ * We get a single completion for every call to
+ * ef_vi_transmitv so handle any other buffers which are part
+ * of the same packet
+ */
+ while (tso_buf != NULL) {
+ if (tso_buf->buf->skb != NULL) {
+ dev_kfree_skb_any(tso_buf->buf->skb);
+ tso_buf->buf->skb = NULL;
+ }
+
+ next = tso_buf->next;
+
+ netfront_accel_buf_put(vnic->tx_bufs, tso_buf->buf->buf_id);
+
+ tso_buf = next;
+ }
+
+ /*
+ * If this was the last one in the batch, we try and send any
+ * pending tx_skb. There should now be buffers and
+ * descriptors
+ */
+ if (is_last)
+ netfront_accel_vi_not_busy(vnic);
+}
+
+
+static void netfront_accel_vi_poll_process_tx(netfront_accel_vnic *vnic,
+ ef_event *ev)
+{
+ struct netfront_accel_pkt_desc *buf;
+ struct netfront_accel_tso_buffer *tso_buf;
+ ef_request_id ids[EF_VI_TRANSMIT_BATCH];
+ int i, n_ids;
+ unsigned long flags;
+
+ /* Get the request ids for this tx completion event. */
+ n_ids = ef_vi_transmit_unbundle(&vnic->vi, ev, ids);
+
+ /* Take the tx buffer spin lock and hold for the duration */
+ spin_lock_irqsave(&vnic->tx_lock, flags);
+
+ for (i = 0; i < n_ids; ++i) {
+ VPRINTK("Tx packet %d complete\n", ids[i]);
+ buf = netfront_accel_buf_find(vnic->tx_bufs, ids[i]);
+ NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_completions++);
+
+ tso_buf = (struct netfront_accel_tso_buffer *)
+ (buf->pkt_kva + NETFRONT_ACCEL_TX_BUF_LENGTH);
+ BUG_ON(tso_buf->buf != buf);
+
+ netfront_accel_vi_tx_complete(vnic, tso_buf, i == (n_ids-1));
+ }
+
+ spin_unlock_irqrestore(&vnic->tx_lock, flags);
+}
+
+
+int netfront_accel_vi_poll(netfront_accel_vnic *vnic, int rx_packets)
+{
+ ef_event ev[ACCEL_VI_POLL_EVENTS];
+ int rx_remain = rx_packets, rc, events, i;
+#if NETFRONT_ACCEL_STATS
+ int n_evs_polled = 0, rx_evs_polled = 0, tx_evs_polled = 0;
+#endif
+ BUG_ON(rx_packets <= 0);
+
+ events = ef_eventq_poll(&vnic->vi, ev,
+ min(rx_remain, ACCEL_VI_POLL_EVENTS));
+ i = 0;
+ NETFRONT_ACCEL_STATS_OP(n_evs_polled += events);
+
+ VPRINTK("%s: %d events\n", __FUNCTION__, events);
+
+ /* Loop over each event */
+ while (events) {
+ VPRINTK("%s: Event "EF_EVENT_FMT", index %lu\n", __FUNCTION__,
+ EF_EVENT_PRI_ARG(ev[i]),
+ (unsigned long)(vnic->vi.evq_state->evq_ptr));
+
+ if ((EF_EVENT_TYPE(ev[i]) == EF_EVENT_TYPE_RX) ||
+ (EF_EVENT_TYPE(ev[i]) == EF_EVENT_TYPE_RX_DISCARD)) {
+ rc = netfront_accel_vi_poll_process_rx(vnic, &ev[i]);
+ rx_remain -= rc;
+ BUG_ON(rx_remain < 0);
+ NETFRONT_ACCEL_STATS_OP(rx_evs_polled++);
+ } else if (EF_EVENT_TYPE(ev[i]) == EF_EVENT_TYPE_TX) {
+ netfront_accel_vi_poll_process_tx(vnic, &ev[i]);
+ NETFRONT_ACCEL_STATS_OP(tx_evs_polled++);
+ } else if (EF_EVENT_TYPE(ev[i]) ==
+ EF_EVENT_TYPE_RX_NO_DESC_TRUNC) {
+ DPRINTK("%s: RX_NO_DESC_TRUNC " EF_EVENT_FMT "\n",
+ __FUNCTION__, EF_EVENT_PRI_ARG(ev[i]));
+ discard_jumbo_state(vnic);
+ NETFRONT_ACCEL_STATS_OP(vnic->stats.rx_no_desc_trunc++);
+ } else {
+ EPRINTK("Unexpected event " EF_EVENT_FMT "\n",
+ EF_EVENT_PRI_ARG(ev[i]));
+ NETFRONT_ACCEL_STATS_OP(vnic->stats.bad_event_count++);
+ }
+
+ i++;
+
+ /* Carry on round the loop if more events and more space */
+ if (i == events) {
+ if (rx_remain == 0)
+ break;
+
+ events = ef_eventq_poll(&vnic->vi, ev,
+ min(rx_remain,
+ ACCEL_VI_POLL_EVENTS));
+ i = 0;
+ NETFRONT_ACCEL_STATS_OP(n_evs_polled += events);
+ }
+ }
+
+#if NETFRONT_ACCEL_STATS
+ vnic->stats.event_count += n_evs_polled;
+ vnic->stats.event_count_since_irq += n_evs_polled;
+ if (n_evs_polled > vnic->stats.events_per_poll_max)
+ vnic->stats.events_per_poll_max = n_evs_polled;
+ if (rx_evs_polled > vnic->stats.events_per_poll_rx_max)
+ vnic->stats.events_per_poll_rx_max = rx_evs_polled;
+ if (tx_evs_polled > vnic->stats.events_per_poll_tx_max)
+ vnic->stats.events_per_poll_tx_max = tx_evs_polled;
+#endif
+
+ return rx_packets - rx_remain;
+}
+
+
+int netfront_accel_vi_enable_interrupts(netfront_accel_vnic *vnic)
+{
+ u32 sw_evq_ptr;
+
+ VPRINTK("%s: checking for event on %p\n", __FUNCTION__, &vnic->vi.evq_state);
+
+ BUG_ON(vnic == NULL);
+ BUG_ON(vnic->vi.evq_state == NULL);
+
+ /* Do a quick check for an event. */
+ if (ef_eventq_has_event(&vnic->vi)) {
+ VPRINTK("%s: found event\n", __FUNCTION__);
+ return 0;
+ }
+
+ VPRINTK("evq_ptr=0x%08x evq_mask=0x%08x\n",
+ vnic->evq_state.evq_ptr, vnic->vi.evq_mask);
+
+ /* Request a wakeup from the hardware. */
+ sw_evq_ptr = vnic->evq_state.evq_ptr & vnic->vi.evq_mask;
+
+ BUG_ON(vnic->hw.falcon.evq_rptr == NULL);
+
+ VPRINTK("Requesting wakeup at 0x%08x, rptr %p\n", sw_evq_ptr,
+ vnic->hw.falcon.evq_rptr);
+ *(volatile u32 *)(vnic->hw.falcon.evq_rptr) = (sw_evq_ptr >> 3);
+
+ return 1;
+}
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <linux/stddef.h>
+#include <linux/errno.h>
+
+#include <xen/xenbus.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+
+#include "accel.h"
+#include "accel_util.h"
+#include "accel_msg_iface.h"
+#include "accel_bufs.h"
+#include "accel_ssr.h"
+/* drivers/xen/netfront/netfront.h */
+#include "netfront.h"
+
+void netfront_accel_set_closing(netfront_accel_vnic *vnic)
+{
+
+ vnic->frontend_state = XenbusStateClosing;
+ net_accel_update_state(vnic->dev, XenbusStateClosing);
+}
+
+
+static void mac_address_change(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ netfront_accel_vnic *vnic;
+ struct xenbus_device *dev;
+ int rc;
+
+ DPRINTK("%s\n", __FUNCTION__);
+
+ vnic = container_of(watch, netfront_accel_vnic,
+ mac_address_watch);
+ dev = vnic->dev;
+
+ rc = net_accel_xen_net_read_mac(dev, vnic->mac);
+
+ if (rc != 0)
+ EPRINTK("%s: failed to read mac (%d)\n", __FUNCTION__, rc);
+}
+
+
+static int setup_mac_address_watch(struct xenbus_device *dev,
+ netfront_accel_vnic *vnic)
+{
+ int err;
+
+ DPRINTK("Setting watch on %s/%s\n", dev->nodename, "mac");
+
+ err = xenbus_watch_path2(dev, dev->nodename, "mac",
+ &vnic->mac_address_watch,
+ mac_address_change);
+ if (err) {
+ EPRINTK("%s: Failed to register xenbus watch: %d\n",
+ __FUNCTION__, err);
+ goto fail;
+ }
+
+ return 0;
+ fail:
+ vnic->mac_address_watch.node = NULL;
+ return err;
+}
+
+
+/* Grant access to some pages and publish through xenbus */
+static int make_named_grant(struct xenbus_device *dev, void *page,
+ const char *name, grant_ref_t *gnt_ref)
+{
+ struct xenbus_transaction tr;
+ int err;
+ grant_ref_t gnt;
+
+ gnt = net_accel_grant_page(dev, virt_to_mfn(page), 0);
+ if (gnt < 0)
+ return gnt;
+
+ do {
+ err = xenbus_transaction_start(&tr);
+ if (err != 0) {
+ EPRINTK("%s: transaction start failed %d\n",
+ __FUNCTION__, err);
+ return err;
+ }
+ err = xenbus_printf(tr, dev->nodename, name, "%d", gnt);
+ if (err != 0) {
+ EPRINTK("%s: xenbus_printf failed %d\n", __FUNCTION__,
+ err);
+ xenbus_transaction_end(tr, 1);
+ return err;
+ }
+ err = xenbus_transaction_end(tr, 0);
+ } while (err == -EAGAIN);
+
+ if (err != 0) {
+ EPRINTK("%s: transaction end failed %d\n", __FUNCTION__, err);
+ return err;
+ }
+
+ *gnt_ref = gnt;
+
+ return 0;
+}
+
+
+static int remove_named_grant(struct xenbus_device *dev,
+ const char *name, grant_ref_t gnt_ref)
+{
+ struct xenbus_transaction tr;
+ int err;
+
+ net_accel_ungrant_page(gnt_ref);
+
+ do {
+ err = xenbus_transaction_start(&tr);
+ if (err != 0) {
+ EPRINTK("%s: transaction start failed %d\n",
+ __FUNCTION__, err);
+ return err;
+ }
+ err = xenbus_rm(tr, dev->nodename, name);
+ if (err != 0) {
+ EPRINTK("%s: xenbus_rm failed %d\n", __FUNCTION__,
+ err);
+ xenbus_transaction_end(tr, 1);
+ return err;
+ }
+ err = xenbus_transaction_end(tr, 0);
+ } while (err == -EAGAIN);
+
+ if (err != 0) {
+ EPRINTK("%s: transaction end failed %d\n", __FUNCTION__, err);
+ return err;
+ }
+
+ return 0;
+}
+
+
+static
+netfront_accel_vnic *netfront_accel_vnic_ctor(struct net_device *net_dev,
+ struct xenbus_device *dev)
+{
+ struct netfront_info *np =
+ (struct netfront_info *)netdev_priv(net_dev);
+ netfront_accel_vnic *vnic;
+ int err;
+
+ /*
+ * A bug in earlier versions of Xen accel plugin system meant
+ * you could be probed twice for the same device on suspend
+ * cancel. Be tolerant of that.
+ */
+ if (np->accel_priv != NULL)
+ return ERR_PTR(-EALREADY);
+
+ /* Alloc mem for state */
+ vnic = kzalloc(sizeof(netfront_accel_vnic), GFP_KERNEL);
+ if (vnic == NULL) {
+ EPRINTK("%s: no memory for vnic state\n", __FUNCTION__);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ spin_lock_init(&vnic->tx_lock);
+
+ mutex_init(&vnic->vnic_mutex);
+ mutex_lock(&vnic->vnic_mutex);
+
+ /* Store so state can be retrieved from device */
+ BUG_ON(np->accel_priv != NULL);
+ np->accel_priv = vnic;
+ vnic->dev = dev;
+ vnic->net_dev = net_dev;
+ spin_lock_init(&vnic->irq_enabled_lock);
+ netfront_accel_ssr_init(&vnic->ssr_state);
+
+ init_waitqueue_head(&vnic->state_wait_queue);
+ vnic->backend_state = XenbusStateUnknown;
+ vnic->frontend_state = XenbusStateClosed;
+ vnic->removing = 0;
+ vnic->domU_state_is_setup = 0;
+ vnic->dom0_state_is_setup = 0;
+ vnic->poll_enabled = 0;
+ vnic->tx_enabled = 0;
+ vnic->tx_skb = NULL;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+ INIT_WORK(&vnic->msg_from_bend, netfront_accel_msg_from_bend);
+#else
+ INIT_WORK(&vnic->msg_from_bend, netfront_accel_msg_from_bend, vnic);
+#endif
+
+ netfront_accel_debugfs_create(vnic);
+
+ mutex_unlock(&vnic->vnic_mutex);
+
+ err = net_accel_xen_net_read_mac(dev, vnic->mac);
+ if (err)
+ goto fail_mac;
+
+ /* Setup a watch on the frontend's MAC address */
+ err = setup_mac_address_watch(dev, vnic);
+ if (err)
+ goto fail_mac;
+
+ return vnic;
+
+fail_mac:
+
+ mutex_lock(&vnic->vnic_mutex);
+
+ netfront_accel_debugfs_remove(vnic);
+
+ netfront_accel_ssr_fini(vnic, &vnic->ssr_state);
+
+ EPRINTK_ON(vnic->tx_skb != NULL);
+
+ vnic->frontend_state = XenbusStateUnknown;
+ net_accel_update_state(dev, XenbusStateUnknown);
+
+ mutex_unlock(&vnic->vnic_mutex);
+
+ np->accel_priv = NULL;
+ kfree(vnic);
+
+ return ERR_PTR(err);
+}
+
+
+static void netfront_accel_vnic_dtor(netfront_accel_vnic *vnic)
+{
+ struct net_device *net_dev = vnic->net_dev;
+ struct netfront_info *np =
+ (struct netfront_info *)netdev_priv(net_dev);
+
+ /*
+ * Now we don't hold the lock any more it is safe to remove
+ * this watch and synchonrise with the completion of
+ * watches
+ */
+ DPRINTK("%s: unregistering xenbus mac watch\n", __FUNCTION__);
+ unregister_xenbus_watch(&vnic->mac_address_watch);
+ kfree(vnic->mac_address_watch.node);
+
+ flush_workqueue(netfront_accel_workqueue);
+
+ mutex_lock(&vnic->vnic_mutex);
+
+ netfront_accel_debugfs_remove(vnic);
+
+ netfront_accel_ssr_fini(vnic, &vnic->ssr_state);
+
+ EPRINTK_ON(vnic->tx_skb != NULL);
+
+ vnic->frontend_state = XenbusStateUnknown;
+ net_accel_update_state(vnic->dev, XenbusStateUnknown);
+
+ mutex_unlock(&vnic->vnic_mutex);
+
+ np->accel_priv = NULL;
+ kfree(vnic);
+}
+
+
+static int vnic_setup_domU_shared_state(struct xenbus_device *dev,
+ netfront_accel_vnic *vnic)
+{
+ struct xenbus_transaction tr;
+ int err;
+ int msgs_per_queue;
+
+
+ DPRINTK("Setting up domU shared state.\n");
+
+ msgs_per_queue = (PAGE_SIZE/2) / sizeof(struct net_accel_msg);
+
+ /* Allocate buffer state */
+ vnic->tx_bufs = netfront_accel_init_bufs(&vnic->tx_lock);
+ if (vnic->tx_bufs == NULL) {
+ err = -ENOMEM;
+ EPRINTK("%s: Failed to allocate tx buffers\n", __FUNCTION__);
+ goto fail_tx_bufs;
+ }
+
+ vnic->rx_bufs = netfront_accel_init_bufs(NULL);
+ if (vnic->rx_bufs == NULL) {
+ err = -ENOMEM;
+ EPRINTK("%s: Failed to allocate rx buffers\n", __FUNCTION__);
+ goto fail_rx_bufs;
+ }
+
+ /*
+ * This allocates two pages, one for the shared page and one
+ * for the message queue.
+ */
+ vnic->shared_page = (struct net_accel_shared_page *)
+ __get_free_pages(GFP_KERNEL, 1);
+ if (vnic->shared_page == NULL) {
+ EPRINTK("%s: no memory for shared pages\n", __FUNCTION__);
+ err = -ENOMEM;
+ goto fail_shared_page;
+ }
+
+ net_accel_msg_init_queue
+ (&vnic->from_dom0, &vnic->shared_page->queue0,
+ (struct net_accel_msg *)((u8*)vnic->shared_page + PAGE_SIZE),
+ msgs_per_queue);
+
+ net_accel_msg_init_queue
+ (&vnic->to_dom0, &vnic->shared_page->queue1,
+ (struct net_accel_msg *)((u8*)vnic->shared_page +
+ (3 * PAGE_SIZE / 2)),
+ msgs_per_queue);
+
+ vnic->msg_state = NETFRONT_ACCEL_MSG_NONE;
+
+ err = make_named_grant(dev, vnic->shared_page, "accel-ctrl-page",
+ &vnic->ctrl_page_gnt);
+ if (err) {
+ EPRINTK("couldn't make ctrl-page named grant\n");
+ goto fail_ctrl_page_grant;
+ }
+
+ err = make_named_grant(dev, (u8*)vnic->shared_page + PAGE_SIZE,
+ "accel-msg-page", &vnic->msg_page_gnt);
+ if (err) {
+ EPRINTK("couldn't make msg-page named grant\n");
+ goto fail_msg_page_grant;
+ }
+
+ /* Create xenbus msg event channel */
+ err = bind_listening_port_to_irqhandler
+ (dev->otherend_id, netfront_accel_msg_channel_irq_from_bend,
+ IRQF_SAMPLE_RANDOM, "vnicctrl", vnic);
+ if (err < 0) {
+ EPRINTK("Couldn't bind msg event channel\n");
+ goto fail_msg_irq;
+ }
+ vnic->msg_channel_irq = err;
+ vnic->msg_channel = irq_to_evtchn_port(vnic->msg_channel_irq);
+
+ /* Create xenbus net event channel */
+ err = bind_listening_port_to_irqhandler
+ (dev->otherend_id, netfront_accel_net_channel_irq_from_bend,
+ IRQF_SAMPLE_RANDOM, "vnicfront", vnic);
+ if (err < 0) {
+ EPRINTK("Couldn't bind net event channel\n");
+ goto fail_net_irq;
+ }
+ vnic->net_channel_irq = err;
+ vnic->net_channel = irq_to_evtchn_port(vnic->net_channel_irq);
+ /* Want to ensure we don't get interrupts before we're ready */
+ netfront_accel_disable_net_interrupts(vnic);
+
+ DPRINTK("otherend %d has msg ch %u (%u) and net ch %u (%u)\n",
+ dev->otherend_id, vnic->msg_channel, vnic->msg_channel_irq,
+ vnic->net_channel, vnic->net_channel_irq);
+
+ do {
+ err = xenbus_transaction_start(&tr);
+ if (err != 0) {
+ EPRINTK("%s: Transaction start failed %d\n",
+ __FUNCTION__, err);
+ goto fail_transaction;
+ }
+
+ err = xenbus_printf(tr, dev->nodename, "accel-msg-channel",
+ "%u", vnic->msg_channel);
+ if (err != 0) {
+ EPRINTK("%s: event channel xenbus write failed %d\n",
+ __FUNCTION__, err);
+ xenbus_transaction_end(tr, 1);
+ goto fail_transaction;
+ }
+
+ err = xenbus_printf(tr, dev->nodename, "accel-net-channel",
+ "%u", vnic->net_channel);
+ if (err != 0) {
+ EPRINTK("%s: net channel xenbus write failed %d\n",
+ __FUNCTION__, err);
+ xenbus_transaction_end(tr, 1);
+ goto fail_transaction;
+ }
+
+ err = xenbus_transaction_end(tr, 0);
+ } while (err == -EAGAIN);
+
+ if (err != 0) {
+ EPRINTK("%s: Transaction end failed %d\n", __FUNCTION__, err);
+ goto fail_transaction;
+ }
+
+ DPRINTK("Completed setting up domU shared state\n");
+
+ return 0;
+
+fail_transaction:
+
+ unbind_from_irqhandler(vnic->net_channel_irq, vnic);
+fail_net_irq:
+
+ unbind_from_irqhandler(vnic->msg_channel_irq, vnic);
+fail_msg_irq:
+
+ remove_named_grant(dev, "accel-ctrl-page", vnic->ctrl_page_gnt);
+fail_msg_page_grant:
+
+ remove_named_grant(dev, "accel-msg-page", vnic->msg_page_gnt);
+fail_ctrl_page_grant:
+
+ free_pages((unsigned long)vnic->shared_page, 1);
+ vnic->shared_page = NULL;
+fail_shared_page:
+
+ netfront_accel_fini_bufs(vnic->rx_bufs);
+fail_rx_bufs:
+
+ netfront_accel_fini_bufs(vnic->tx_bufs);
+fail_tx_bufs:
+
+ /* Undo the memory allocation created when we got the HELLO */
+ netfront_accel_free_buffer_mem(&vnic->bufpages,
+ vnic->rx_bufs,
+ vnic->tx_bufs);
+
+ DPRINTK("Failed to setup domU shared state with code %d\n", err);
+
+ return err;
+}
+
+
+static void vnic_remove_domU_shared_state(struct xenbus_device *dev,
+ netfront_accel_vnic *vnic)
+{
+ struct xenbus_transaction tr;
+
+ /*
+ * Don't remove any watches because we currently hold the
+ * mutex and the watches take the mutex.
+ */
+
+ DPRINTK("%s: removing event channel irq handlers %d %d\n",
+ __FUNCTION__, vnic->net_channel_irq, vnic->msg_channel_irq);
+ do {
+ if (xenbus_transaction_start(&tr) != 0)
+ break;
+ xenbus_rm(tr, dev->nodename, "accel-msg-channel");
+ xenbus_rm(tr, dev->nodename, "accel-net-channel");
+ } while (xenbus_transaction_end(tr, 0) == -EAGAIN);
+
+ unbind_from_irqhandler(vnic->net_channel_irq, vnic);
+ unbind_from_irqhandler(vnic->msg_channel_irq, vnic);
+
+ /* ungrant pages for msg channel */
+ remove_named_grant(dev, "accel-ctrl-page", vnic->ctrl_page_gnt);
+ remove_named_grant(dev, "accel-msg-page", vnic->msg_page_gnt);
+ free_pages((unsigned long)vnic->shared_page, 1);
+ vnic->shared_page = NULL;
+
+ /* ungrant pages for buffers, and free buffer memory */
+ netfront_accel_free_buffer_mem(&vnic->bufpages,
+ vnic->rx_bufs,
+ vnic->tx_bufs);
+ netfront_accel_fini_bufs(vnic->rx_bufs);
+ netfront_accel_fini_bufs(vnic->tx_bufs);
+}
+
+
+static void vnic_setup_dom0_shared_state(struct xenbus_device *dev,
+ netfront_accel_vnic *vnic)
+{
+ DPRINTK("Setting up dom0 shared state\n");
+
+ netfront_accel_vi_ctor(vnic);
+
+ /*
+ * Message processing will be enabled when this function
+ * returns, but we might have missed an interrupt. Schedule a
+ * check just in case.
+ */
+ queue_work(netfront_accel_workqueue, &vnic->msg_from_bend);
+}
+
+
+static void vnic_remove_dom0_shared_state(struct xenbus_device *dev,
+ netfront_accel_vnic *vnic)
+{
+ DPRINTK("Removing dom0 shared state\n");
+
+ vnic_stop_fastpath(vnic);
+
+ netfront_accel_vi_dtor(vnic);
+}
+
+
+/*************************************************************************/
+
+/*
+ * The following code handles accelstate changes between the frontend
+ * and the backend. In response to transitions, calls the following
+ * functions in matching pairs:
+ *
+ * vnic_setup_domU_shared_state
+ * vnic_remove_domU_shared_state
+ *
+ * vnic_setup_dom0_shared_state
+ * vnic_remove_dom0_shared_state
+ *
+ * Valid state transitions for DomU are as follows:
+ *
+ * Closed->Init on probe or in response to Init from dom0
+ *
+ * Init->Connected in response to Init from dom0
+ * Init->Closing on error providing dom0 is in Init
+ * Init->Closed on remove or in response to Closing from dom0
+ *
+ * Connected->Closing on error/remove
+ * Connected->Closed in response to Closing from dom0
+ *
+ * Closing->Closed in response to Closing from dom0
+ *
+ */
+
+
+/* Function to deal with Xenbus accel state change in backend */
+static void netfront_accel_backend_accel_changed(netfront_accel_vnic *vnic,
+ XenbusState backend_state)
+{
+ struct xenbus_device *dev = vnic->dev;
+ XenbusState frontend_state;
+ int state;
+
+ DPRINTK("%s: changing from %s to %s. nodename %s, otherend %s\n",
+ __FUNCTION__, xenbus_strstate(vnic->backend_state),
+ xenbus_strstate(backend_state), dev->nodename, dev->otherend);
+
+ /*
+ * Ignore duplicate state changes. This can happen if the
+ * backend changes state twice in quick succession and the
+ * first watch fires in the frontend after the second
+ * transition has completed.
+ */
+ if (vnic->backend_state == backend_state)
+ return;
+
+ vnic->backend_state = backend_state;
+ frontend_state = vnic->frontend_state;
+
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ /*
+ * It's possible for us to miss the closed state from
+ * dom0, so do the work here.
+ */
+ if (vnic->domU_state_is_setup) {
+ vnic_remove_domU_shared_state(dev, vnic);
+ vnic->domU_state_is_setup = 0;
+ }
+
+ if (frontend_state != XenbusStateInitialising) {
+ /* Make sure the backend doesn't go away. */
+ frontend_state = XenbusStateInitialising;
+ net_accel_update_state(dev, frontend_state);
+ xenbus_scanf(XBT_NIL, dev->otherend, "accelstate", "%d", &state);
+ backend_state = (XenbusState)state;
+ if (backend_state != XenbusStateInitialising)
+ break;
+ }
+
+ /* Start the new connection. */
+ if (!vnic->removing) {
+ BUG_ON(vnic->domU_state_is_setup);
+ if (vnic_setup_domU_shared_state(dev, vnic) == 0) {
+ vnic->domU_state_is_setup = 1;
+ frontend_state = XenbusStateConnected;
+ } else
+ frontend_state = XenbusStateClosing;
+ }
+ break;
+ case XenbusStateConnected:
+ if (vnic->domU_state_is_setup &&
+ !vnic->dom0_state_is_setup) {
+ vnic_setup_dom0_shared_state(dev, vnic);
+ vnic->dom0_state_is_setup = 1;
+ }
+ break;
+ default:
+ case XenbusStateClosing:
+ if (vnic->dom0_state_is_setup) {
+ vnic_remove_dom0_shared_state(dev, vnic);
+ vnic->dom0_state_is_setup = 0;
+ }
+ frontend_state = XenbusStateClosed;
+ break;
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ if (vnic->domU_state_is_setup) {
+ vnic_remove_domU_shared_state(dev, vnic);
+ vnic->domU_state_is_setup = 0;
+ }
+ break;
+ }
+
+ if (frontend_state != vnic->frontend_state) {
+ DPRINTK("Switching from state %s (%d) to %s (%d)\n",
+ xenbus_strstate(vnic->frontend_state),
+ vnic->frontend_state,
+ xenbus_strstate(frontend_state), frontend_state);
+ vnic->frontend_state = frontend_state;
+ net_accel_update_state(dev, frontend_state);
+ }
+
+ wake_up(&vnic->state_wait_queue);
+}
+
+
+static void backend_accel_state_change(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ int state;
+ netfront_accel_vnic *vnic;
+ struct xenbus_device *dev;
+
+ DPRINTK("%s\n", __FUNCTION__);
+
+ vnic = container_of(watch, struct netfront_accel_vnic,
+ backend_accel_watch);
+
+ mutex_lock(&vnic->vnic_mutex);
+
+ dev = vnic->dev;
+
+ state = (int)XenbusStateUnknown;
+ xenbus_scanf(XBT_NIL, dev->otherend, "accelstate", "%d", &state);
+ netfront_accel_backend_accel_changed(vnic, state);
+
+ mutex_unlock(&vnic->vnic_mutex);
+}
+
+
+static int setup_dom0_accel_watch(struct xenbus_device *dev,
+ netfront_accel_vnic *vnic)
+{
+ int err;
+
+ DPRINTK("Setting watch on %s/%s\n", dev->otherend, "accelstate");
+
+ err = xenbus_watch_path2(dev, dev->otherend, "accelstate",
+ &vnic->backend_accel_watch,
+ backend_accel_state_change);
+ if (err) {
+ EPRINTK("%s: Failed to register xenbus watch: %d\n",
+ __FUNCTION__, err);
+ goto fail;
+ }
+ return 0;
+ fail:
+ vnic->backend_accel_watch.node = NULL;
+ return err;
+}
+
+
+int netfront_accel_probe(struct net_device *net_dev, struct xenbus_device *dev)
+{
+ netfront_accel_vnic *vnic;
+ int err;
+
+ DPRINTK("Probe passed device %s\n", dev->nodename);
+
+ vnic = netfront_accel_vnic_ctor(net_dev, dev);
+ if (IS_ERR(vnic))
+ return PTR_ERR(vnic);
+
+ /*
+ * Setup a watch on the backend accel state. This sets things
+ * going.
+ */
+ err = setup_dom0_accel_watch(dev, vnic);
+ if (err) {
+ netfront_accel_vnic_dtor(vnic);
+ EPRINTK("%s: probe failed with code %d\n", __FUNCTION__, err);
+ return err;
+ }
+
+ /*
+ * Indicate to the other end that we're ready to start unless
+ * the watch has already fired.
+ */
+ mutex_lock(&vnic->vnic_mutex);
+ VPRINTK("setup success, updating accelstate\n");
+ if (vnic->frontend_state == XenbusStateClosed) {
+ vnic->frontend_state = XenbusStateInitialising;
+ net_accel_update_state(dev, XenbusStateInitialising);
+ }
+ mutex_unlock(&vnic->vnic_mutex);
+
+ DPRINTK("Probe done device %s\n", dev->nodename);
+
+ return 0;
+}
+
+
+int netfront_accel_remove(struct xenbus_device *dev)
+{
+ struct netfront_info *np =
+ (struct netfront_info *)dev->dev.driver_data;
+ netfront_accel_vnic *vnic = (netfront_accel_vnic *)np->accel_priv;
+
+ DPRINTK("%s %s\n", __FUNCTION__, dev->nodename);
+
+ BUG_ON(vnic == NULL);
+
+ mutex_lock(&vnic->vnic_mutex);
+
+ /* Reject any attempts to connect. */
+ vnic->removing = 1;
+
+ /* Close any existing connection. */
+ if (vnic->frontend_state == XenbusStateConnected) {
+ vnic->frontend_state = XenbusStateClosing;
+ net_accel_update_state(dev, XenbusStateClosing);
+ }
+
+ mutex_unlock(&vnic->vnic_mutex);
+
+ DPRINTK("%s waiting for release of %s\n", __FUNCTION__, dev->nodename);
+
+ /*
+ * Wait for the xenbus watch to release the shared resources.
+ * This indicates that dom0 has made the transition
+ * Closing->Closed or that dom0 was in Closed or Init and no
+ * resources were mapped.
+ */
+ wait_event(vnic->state_wait_queue,
+ !vnic->domU_state_is_setup);
+
+ /*
+ * Now we don't need this watch anymore it is safe to remove
+ * it (and so synchronise with it completing if outstanding)
+ */
+ DPRINTK("%s: unregistering xenbus accel watch\n",
+ __FUNCTION__);
+ unregister_xenbus_watch(&vnic->backend_accel_watch);
+ kfree(vnic->backend_accel_watch.node);
+
+ netfront_accel_vnic_dtor(vnic);
+
+ DPRINTK("%s done %s\n", __FUNCTION__, dev->nodename);
+
+ return 0;
+}
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * \author slp
+ * \brief Falcon specific definitions
+ * \date 2004/08
+ */
+
+#ifndef __EF_VI_FALCON_H__
+#define __EF_VI_FALCON_H__
+
+#define EFHW_4K 0x00001000u
+#define EFHW_8K 0x00002000u
+
+/* include the autogenerated register definitions */
+
+#include "ef_vi_falcon_core.h"
+#include "ef_vi_falcon_desc.h"
+#include "ef_vi_falcon_event.h"
+
+
+/*----------------------------------------------------------------------------
+ *
+ * Helpers to turn bit shifts into dword shifts and check that the bit fields
+ * haven't overflown the dword etc. Aim is to preserve consistency with the
+ * autogenerated headers - once stable we could hard code.
+ *
+ *---------------------------------------------------------------------------*/
+
+/* mask constructors */
+#define __FALCON_MASK(WIDTH,T) ((((T)1) << (WIDTH)) - 1)
+#define __EFVI_MASK32(WIDTH) __FALCON_MASK((WIDTH),uint32_t)
+#define __EFVI_MASK64(WIDTH) __FALCON_MASK((WIDTH),uint64_t)
+
+#define __EFVI_FALCON_MASKFIELD32(LBN, WIDTH) ((uint32_t) \
+ (__EFVI_MASK32(WIDTH) << (LBN)))
+
+/* constructors for fields which span the first and second dwords */
+#define __LW(LBN) (32 - LBN)
+#define LOW(v, LBN, WIDTH) ((uint32_t) \
+ (((v) & __EFVI_MASK64(__LW((LBN)))) << (LBN)))
+#define HIGH(v, LBN, WIDTH) ((uint32_t)(((v) >> __LW((LBN))) & \
+ __EFVI_MASK64((WIDTH - __LW((LBN))))))
+/* constructors for fields within the second dword */
+#define __DW2(LBN) ((LBN) - 32)
+
+/* constructors for fields which span the second and third dwords */
+#define __LW2(LBN) (64 - LBN)
+#define LOW2(v, LBN, WIDTH) ((uint32_t) \
+ (((v) & __EFVI_MASK64(__LW2((LBN)))) << ((LBN) - 32)))
+#define HIGH2(v, LBN, WIDTH) ((uint32_t) \
+ (((v) >> __LW2((LBN))) & __EFVI_MASK64((WIDTH - __LW2((LBN))))))
+
+/* constructors for fields within the third dword */
+#define __DW3(LBN) ((LBN) - 64)
+
+
+/* constructors for fields which span the third and fourth dwords */
+#define __LW3(LBN) (96 - LBN)
+#define LOW3(v, LBN, WIDTH) ((uint32_t) \
+ (((v) & __EFVI_MASK64(__LW3((LBN)))) << ((LBN) - 64)))
+#define HIGH3(v, LBN, WIDTH) ((unit32_t) \
+ (((v) >> __LW3((LBN))) & __EFVI_MASK64((WIDTH - __LW3((LBN))))))
+
+/* constructors for fields within the fourth dword */
+#define __DW4(LBN) ((LBN) - 96)
+
+/* checks that the autogenerated headers our consistent with our model */
+#define WIDTHCHCK(a, b) ef_assert((a) == (b))
+#define RANGECHCK(v, WIDTH) \
+ ef_assert(((uint64_t)(v) & ~(__EFVI_MASK64((WIDTH)))) == 0)
+
+/* fields within the first dword */
+#define DWCHCK(LBN, WIDTH) ef_assert(((LBN) >= 0) &&(((LBN)+(WIDTH)) <= 32))
+
+/* fields which span the first and second dwords */
+#define LWCHK(LBN, WIDTH) ef_assert(WIDTH >= __LW(LBN))
+
+/*----------------------------------------------------------------------------
+ *
+ * Buffer virtual addresses (4K buffers)
+ *
+ *---------------------------------------------------------------------------*/
+
+/* Form a buffer virtual address from buffer ID and offset. If the offset
+** is larger than the buffer size, then the buffer indexed will be
+** calculated appropriately. It is the responsibility of the caller to
+** ensure that they have valid buffers programmed at that address.
+*/
+#define EFVI_FALCON_VADDR_4K_S (12)
+#define EFVI_FALCON_VADDR_M 0xfffff /* post shift mask */
+
+
+#define EFVI_FALCON_BUFFER_4K_ADDR(id,off) \
+ (((id) << EFVI_FALCON_VADDR_4K_S) + (off))
+
+#define EFVI_FALCON_BUFFER_4K_PAGE(vaddr) \
+ (((vaddr) >> EFVI_FALCON_VADDR_4K_S) & EFVI_FALCON_VADDR_M)
+
+#define EFVI_FALCON_BUFFER_4K_OFF(vaddr) \
+ ((vaddr) & __EFVI_MASK32(EFVI_FALCON_VADDR_4K_S))
+
+
+/*----------------------------------------------------------------------------
+ *
+ * Masks
+ *
+ *---------------------------------------------------------------------------*/
+
+#define EFVI_FALCON_CLOCK_ASIC_HZ (125000)
+#define EFVI_FALCON_CLOCK_FPGA_HZ (62500)
+#define EFVI_FALCON_CLOCK_HZ EFVI_FALCON_CLOCK_ASIC_HZ
+
+
+/*----------------------------------------------------------------------------
+ *
+ * Timers
+ *
+ *---------------------------------------------------------------------------*/
+
+/* Event-Queue Timer granularity - measured in us
+ Given by: 4096 * 3 cycle * clock period */
+
+#define EFVI_FALCON_EVQTIMER_PERIOD_US ((4096 * 3 * 1000) / EFVI_FALCON_CLOCK_HZ)
+
+/* mode bits */
+#define EFVI_FALCON_TIMER_MODE_DIS 0 /* disabled */
+#define EFVI_FALCON_TIMER_MODE_RUN 1 /* started counting right away */
+#define EFVI_FALCON_TIMER_MODE_HOLD 2 /* trigger mode (user queues) */
+
+#define EFVI_FALCON_EVQTIMER_HOLD (EFVI_FALCON_TIMER_MODE_HOLD << TIMER_MODE_LBN)
+#define EFVI_FALCON_EVQTIMER_RUN (EFVI_FALCON_TIMER_MODE_RUN << TIMER_MODE_LBN)
+#define EFVI_FALCON_EVQTIMER_DISABLE (EFVI_FALCON_TIMER_MODE_DIS << TIMER_MODE_LBN)
+
+
+/* ---- efhw_event_t helpers --- */
+
+#define EFVI_FALCON_EVENT_CODE(evp) \
+ ((evp)->u64 & EFVI_FALCON_EVENT_CODE_MASK)
+
+#define EFVI_FALCON_EVENT_SW_DATA_MASK 0x0000ffff
+
+#define __EFVI_FALCON_OPEN_MASK(WIDTH) ((((uint64_t)1) << (WIDTH)) - 1)
+
+#define EFVI_FALCON_EVENT_CODE_MASK \
+ (__EFVI_FALCON_OPEN_MASK(EV_CODE_WIDTH) << EV_CODE_LBN)
+
+
+#endif /* __EF_VI_FALCON_H__ */
--- /dev/null
+
+#define EFVI_FALCON_EXTENDED_P_BAR 1
+
+//////////////---- Bus Interface Unit Registers C Header ----//////////////
+#define IOM_IND_ADR_REG_OFST 0x0 // IO-mapped indirect access address register
+ #define IOM_AUTO_ADR_INC_EN_LBN 16
+ #define IOM_AUTO_ADR_INC_EN_WIDTH 1
+ #define IOM_IND_ADR_LBN 0
+ #define IOM_IND_ADR_WIDTH 16
+#define IOM_IND_DAT_REG_OFST 0x4 // IO-mapped indirect access data register
+ #define IOM_IND_DAT_LBN 0
+ #define IOM_IND_DAT_WIDTH 32
+#define ADR_REGION_REG_KER_OFST 0x0 // Address region register
+#define ADR_REGION_REG_OFST 0x0 // Address region register
+ #define ADR_REGION3_LBN 96
+ #define ADR_REGION3_WIDTH 18
+ #define ADR_REGION2_LBN 64
+ #define ADR_REGION2_WIDTH 18
+ #define ADR_REGION1_LBN 32
+ #define ADR_REGION1_WIDTH 18
+ #define ADR_REGION0_LBN 0
+ #define ADR_REGION0_WIDTH 18
+#define INT_EN_REG_KER_OFST 0x10 // Kernel driver Interrupt enable register
+ #define KER_INT_CHAR_LBN 4
+ #define KER_INT_CHAR_WIDTH 1
+ #define KER_INT_KER_LBN 3
+ #define KER_INT_KER_WIDTH 1
+ #define ILL_ADR_ERR_INT_EN_KER_LBN 2
+ #define ILL_ADR_ERR_INT_EN_KER_WIDTH 1
+ #define SRM_PERR_INT_EN_KER_LBN 1
+ #define SRM_PERR_INT_EN_KER_WIDTH 1
+ #define DRV_INT_EN_KER_LBN 0
+ #define DRV_INT_EN_KER_WIDTH 1
+#define INT_EN_REG_CHAR_OFST 0x20 // Char Driver interrupt enable register
+ #define CHAR_INT_CHAR_LBN 4
+ #define CHAR_INT_CHAR_WIDTH 1
+ #define CHAR_INT_KER_LBN 3
+ #define CHAR_INT_KER_WIDTH 1
+ #define ILL_ADR_ERR_INT_EN_CHAR_LBN 2
+ #define ILL_ADR_ERR_INT_EN_CHAR_WIDTH 1
+ #define SRM_PERR_INT_EN_CHAR_LBN 1
+ #define SRM_PERR_INT_EN_CHAR_WIDTH 1
+ #define DRV_INT_EN_CHAR_LBN 0
+ #define DRV_INT_EN_CHAR_WIDTH 1
+#define INT_ADR_REG_KER_OFST 0x30 // Interrupt host address for Kernel driver
+ #define INT_ADR_KER_LBN 0
+ #define INT_ADR_KER_WIDTH 64
+ #define DRV_INT_KER_LBN 32
+ #define DRV_INT_KER_WIDTH 1
+ #define EV_FF_HALF_INT_KER_LBN 3
+ #define EV_FF_HALF_INT_KER_WIDTH 1
+ #define EV_FF_FULL_INT_KER_LBN 2
+ #define EV_FF_FULL_INT_KER_WIDTH 1
+ #define ILL_ADR_ERR_INT_KER_LBN 1
+ #define ILL_ADR_ERR_INT_KER_WIDTH 1
+ #define SRAM_PERR_INT_KER_LBN 0
+ #define SRAM_PERR_INT_KER_WIDTH 1
+#define INT_ADR_REG_CHAR_OFST 0x40 // Interrupt host address for Char driver
+ #define INT_ADR_CHAR_LBN 0
+ #define INT_ADR_CHAR_WIDTH 64
+ #define DRV_INT_CHAR_LBN 32
+ #define DRV_INT_CHAR_WIDTH 1
+ #define EV_FF_HALF_INT_CHAR_LBN 3
+ #define EV_FF_HALF_INT_CHAR_WIDTH 1
+ #define EV_FF_FULL_INT_CHAR_LBN 2
+ #define EV_FF_FULL_INT_CHAR_WIDTH 1
+ #define ILL_ADR_ERR_INT_CHAR_LBN 1
+ #define ILL_ADR_ERR_INT_CHAR_WIDTH 1
+ #define SRAM_PERR_INT_CHAR_LBN 0
+ #define SRAM_PERR_INT_CHAR_WIDTH 1
+#define INT_ISR0_B0_OFST 0x90 // B0 only
+#define INT_ISR1_B0_OFST 0xA0
+#define INT_ACK_REG_KER_A1_OFST 0x50 // Kernel interrupt acknowledge register
+ #define RESERVED_LBN 0
+ #define RESERVED_WIDTH 32
+#define INT_ACK_REG_CHAR_A1_OFST 0x60 // CHAR interrupt acknowledge register
+ #define RESERVED_LBN 0
+ #define RESERVED_WIDTH 32
+//////////////---- Global CSR Registers C Header ----//////////////
+#define STRAP_REG_KER_OFST 0x200 // ASIC strap status register
+#define STRAP_REG_OFST 0x200 // ASIC strap status register
+ #define ONCHIP_SRAM_LBN 16
+ #define ONCHIP_SRAM_WIDTH 0
+ #define STRAP_ISCSI_EN_LBN 3
+ #define STRAP_ISCSI_EN_WIDTH 1
+ #define STRAP_PINS_LBN 0
+ #define STRAP_PINS_WIDTH 3
+#define GPIO_CTL_REG_KER_OFST 0x210 // GPIO control register
+#define GPIO_CTL_REG_OFST 0x210 // GPIO control register
+ #define GPIO_OEN_LBN 24
+ #define GPIO_OEN_WIDTH 4
+ #define GPIO_OUT_LBN 16
+ #define GPIO_OUT_WIDTH 4
+ #define GPIO_IN_LBN 8
+ #define GPIO_IN_WIDTH 4
+ #define GPIO_PWRUP_VALUE_LBN 0
+ #define GPIO_PWRUP_VALUE_WIDTH 4
+#define GLB_CTL_REG_KER_OFST 0x220 // Global control register
+#define GLB_CTL_REG_OFST 0x220 // Global control register
+ #define SWRST_LBN 0
+ #define SWRST_WIDTH 1
+#define FATAL_INTR_REG_KER_OFST 0x230 // Fatal interrupt register for Kernel
+ #define PCI_BUSERR_INT_KER_EN_LBN 43
+ #define PCI_BUSERR_INT_KER_EN_WIDTH 1
+ #define SRAM_OOB_INT_KER_EN_LBN 42
+ #define SRAM_OOB_INT_KER_EN_WIDTH 1
+ #define BUFID_OOB_INT_KER_EN_LBN 41
+ #define BUFID_OOB_INT_KER_EN_WIDTH 1
+ #define MEM_PERR_INT_KER_EN_LBN 40
+ #define MEM_PERR_INT_KER_EN_WIDTH 1
+ #define RBUF_OWN_INT_KER_EN_LBN 39
+ #define RBUF_OWN_INT_KER_EN_WIDTH 1
+ #define TBUF_OWN_INT_KER_EN_LBN 38
+ #define TBUF_OWN_INT_KER_EN_WIDTH 1
+ #define RDESCQ_OWN_INT_KER_EN_LBN 37
+ #define RDESCQ_OWN_INT_KER_EN_WIDTH 1
+ #define TDESCQ_OWN_INT_KER_EN_LBN 36
+ #define TDESCQ_OWN_INT_KER_EN_WIDTH 1
+ #define EVQ_OWN_INT_KER_EN_LBN 35
+ #define EVQ_OWN_INT_KER_EN_WIDTH 1
+ #define EVFF_OFLO_INT_KER_EN_LBN 34
+ #define EVFF_OFLO_INT_KER_EN_WIDTH 1
+ #define ILL_ADR_INT_KER_EN_LBN 33
+ #define ILL_ADR_INT_KER_EN_WIDTH 1
+ #define SRM_PERR_INT_KER_EN_LBN 32
+ #define SRM_PERR_INT_KER_EN_WIDTH 1
+ #define PCI_BUSERR_INT_KER_LBN 11
+ #define PCI_BUSERR_INT_KER_WIDTH 1
+ #define SRAM_OOB_INT_KER_LBN 10
+ #define SRAM_OOB_INT_KER_WIDTH 1
+ #define BUFID_OOB_INT_KER_LBN 9
+ #define BUFID_OOB_INT_KER_WIDTH 1
+ #define MEM_PERR_INT_KER_LBN 8
+ #define MEM_PERR_INT_KER_WIDTH 1
+ #define RBUF_OWN_INT_KER_LBN 7
+ #define RBUF_OWN_INT_KER_WIDTH 1
+ #define TBUF_OWN_INT_KER_LBN 6
+ #define TBUF_OWN_INT_KER_WIDTH 1
+ #define RDESCQ_OWN_INT_KER_LBN 5
+ #define RDESCQ_OWN_INT_KER_WIDTH 1
+ #define TDESCQ_OWN_INT_KER_LBN 4
+ #define TDESCQ_OWN_INT_KER_WIDTH 1
+ #define EVQ_OWN_INT_KER_LBN 3
+ #define EVQ_OWN_INT_KER_WIDTH 1
+ #define EVFF_OFLO_INT_KER_LBN 2
+ #define EVFF_OFLO_INT_KER_WIDTH 1
+ #define ILL_ADR_INT_KER_LBN 1
+ #define ILL_ADR_INT_KER_WIDTH 1
+ #define SRM_PERR_INT_KER_LBN 0
+ #define SRM_PERR_INT_KER_WIDTH 1
+#define FATAL_INTR_REG_OFST 0x240 // Fatal interrupt register for Char
+ #define PCI_BUSERR_INT_CHAR_EN_LBN 43
+ #define PCI_BUSERR_INT_CHAR_EN_WIDTH 1
+ #define SRAM_OOB_INT_CHAR_EN_LBN 42
+ #define SRAM_OOB_INT_CHAR_EN_WIDTH 1
+ #define BUFID_OOB_INT_CHAR_EN_LBN 41
+ #define BUFID_OOB_INT_CHAR_EN_WIDTH 1
+ #define MEM_PERR_INT_CHAR_EN_LBN 40
+ #define MEM_PERR_INT_CHAR_EN_WIDTH 1
+ #define RBUF_OWN_INT_CHAR_EN_LBN 39
+ #define RBUF_OWN_INT_CHAR_EN_WIDTH 1
+ #define TBUF_OWN_INT_CHAR_EN_LBN 38
+ #define TBUF_OWN_INT_CHAR_EN_WIDTH 1
+ #define RDESCQ_OWN_INT_CHAR_EN_LBN 37
+ #define RDESCQ_OWN_INT_CHAR_EN_WIDTH 1
+ #define TDESCQ_OWN_INT_CHAR_EN_LBN 36
+ #define TDESCQ_OWN_INT_CHAR_EN_WIDTH 1
+ #define EVQ_OWN_INT_CHAR_EN_LBN 35
+ #define EVQ_OWN_INT_CHAR_EN_WIDTH 1
+ #define EVFF_OFLO_INT_CHAR_EN_LBN 34
+ #define EVFF_OFLO_INT_CHAR_EN_WIDTH 1
+ #define ILL_ADR_INT_CHAR_EN_LBN 33
+ #define ILL_ADR_INT_CHAR_EN_WIDTH 1
+ #define SRM_PERR_INT_CHAR_EN_LBN 32
+ #define SRM_PERR_INT_CHAR_EN_WIDTH 1
+ #define FATAL_INTR_REG_EN_BITS 0xffffffffffffffffULL
+ #define PCI_BUSERR_INT_CHAR_LBN 11
+ #define PCI_BUSERR_INT_CHAR_WIDTH 1
+ #define SRAM_OOB_INT_CHAR_LBN 10
+ #define SRAM_OOB_INT_CHAR_WIDTH 1
+ #define BUFID_OOB_INT_CHAR_LBN 9
+ #define BUFID_OOB_INT_CHAR_WIDTH 1
+ #define MEM_PERR_INT_CHAR_LBN 8
+ #define MEM_PERR_INT_CHAR_WIDTH 1
+ #define RBUF_OWN_INT_CHAR_LBN 7
+ #define RBUF_OWN_INT_CHAR_WIDTH 1
+ #define TBUF_OWN_INT_CHAR_LBN 6
+ #define TBUF_OWN_INT_CHAR_WIDTH 1
+ #define RDESCQ_OWN_INT_CHAR_LBN 5
+ #define RDESCQ_OWN_INT_CHAR_WIDTH 1
+ #define TDESCQ_OWN_INT_CHAR_LBN 4
+ #define TDESCQ_OWN_INT_CHAR_WIDTH 1
+ #define EVQ_OWN_INT_CHAR_LBN 3
+ #define EVQ_OWN_INT_CHAR_WIDTH 1
+ #define EVFF_OFLO_INT_CHAR_LBN 2
+ #define EVFF_OFLO_INT_CHAR_WIDTH 1
+ #define ILL_ADR_INT_CHAR_LBN 1
+ #define ILL_ADR_INT_CHAR_WIDTH 1
+ #define SRM_PERR_INT_CHAR_LBN 0
+ #define SRM_PERR_INT_CHAR_WIDTH 1
+#define DP_CTRL_REG_OFST 0x250 // Datapath control register
+ #define FLS_EVQ_ID_LBN 0
+ #define FLS_EVQ_ID_WIDTH 12
+#define MEM_STAT_REG_KER_OFST 0x260 // Memory status register
+#define MEM_STAT_REG_OFST 0x260 // Memory status register
+ #define MEM_PERR_VEC_LBN 53
+ #define MEM_PERR_VEC_WIDTH 38
+ #define MBIST_CORR_LBN 38
+ #define MBIST_CORR_WIDTH 15
+ #define MBIST_ERR_LBN 0
+ #define MBIST_ERR_WIDTH 38
+#define DEBUG_REG_KER_OFST 0x270 // Debug register
+#define DEBUG_REG_OFST 0x270 // Debug register
+ #define DEBUG_BLK_SEL2_LBN 47
+ #define DEBUG_BLK_SEL2_WIDTH 3
+ #define DEBUG_BLK_SEL1_LBN 44
+ #define DEBUG_BLK_SEL1_WIDTH 3
+ #define DEBUG_BLK_SEL0_LBN 41
+ #define DEBUG_BLK_SEL0_WIDTH 3
+ #define MISC_DEBUG_ADDR_LBN 36
+ #define MISC_DEBUG_ADDR_WIDTH 5
+ #define SERDES_DEBUG_ADDR_LBN 31
+ #define SERDES_DEBUG_ADDR_WIDTH 5
+ #define EM_DEBUG_ADDR_LBN 26
+ #define EM_DEBUG_ADDR_WIDTH 5
+ #define SR_DEBUG_ADDR_LBN 21
+ #define SR_DEBUG_ADDR_WIDTH 5
+ #define EV_DEBUG_ADDR_LBN 16
+ #define EV_DEBUG_ADDR_WIDTH 5
+ #define RX_DEBUG_ADDR_LBN 11
+ #define RX_DEBUG_ADDR_WIDTH 5
+ #define TX_DEBUG_ADDR_LBN 6
+ #define TX_DEBUG_ADDR_WIDTH 5
+ #define BIU_DEBUG_ADDR_LBN 1
+ #define BIU_DEBUG_ADDR_WIDTH 5
+ #define DEBUG_EN_LBN 0
+ #define DEBUG_EN_WIDTH 1
+#define DRIVER_REG0_KER_OFST 0x280 // Driver scratch register 0
+#define DRIVER_REG0_OFST 0x280 // Driver scratch register 0
+ #define DRIVER_DW0_LBN 0
+ #define DRIVER_DW0_WIDTH 32
+#define DRIVER_REG1_KER_OFST 0x290 // Driver scratch register 1
+#define DRIVER_REG1_OFST 0x290 // Driver scratch register 1
+ #define DRIVER_DW1_LBN 0
+ #define DRIVER_DW1_WIDTH 32
+#define DRIVER_REG2_KER_OFST 0x2A0 // Driver scratch register 2
+#define DRIVER_REG2_OFST 0x2A0 // Driver scratch register 2
+ #define DRIVER_DW2_LBN 0
+ #define DRIVER_DW2_WIDTH 32
+#define DRIVER_REG3_KER_OFST 0x2B0 // Driver scratch register 3
+#define DRIVER_REG3_OFST 0x2B0 // Driver scratch register 3
+ #define DRIVER_DW3_LBN 0
+ #define DRIVER_DW3_WIDTH 32
+#define DRIVER_REG4_KER_OFST 0x2C0 // Driver scratch register 4
+#define DRIVER_REG4_OFST 0x2C0 // Driver scratch register 4
+ #define DRIVER_DW4_LBN 0
+ #define DRIVER_DW4_WIDTH 32
+#define DRIVER_REG5_KER_OFST 0x2D0 // Driver scratch register 5
+#define DRIVER_REG5_OFST 0x2D0 // Driver scratch register 5
+ #define DRIVER_DW5_LBN 0
+ #define DRIVER_DW5_WIDTH 32
+#define DRIVER_REG6_KER_OFST 0x2E0 // Driver scratch register 6
+#define DRIVER_REG6_OFST 0x2E0 // Driver scratch register 6
+ #define DRIVER_DW6_LBN 0
+ #define DRIVER_DW6_WIDTH 32
+#define DRIVER_REG7_KER_OFST 0x2F0 // Driver scratch register 7
+#define DRIVER_REG7_OFST 0x2F0 // Driver scratch register 7
+ #define DRIVER_DW7_LBN 0
+ #define DRIVER_DW7_WIDTH 32
+#define ALTERA_BUILD_REG_OFST 0x300 // Altera build register
+#define ALTERA_BUILD_REG_OFST 0x300 // Altera build register
+ #define ALTERA_BUILD_VER_LBN 0
+ #define ALTERA_BUILD_VER_WIDTH 32
+
+/* so called CSR spare register
+ - contains separate parity enable bits for the various internal memory blocks */
+#define MEM_PARITY_ERR_EN_REG_KER 0x310
+#define MEM_PARITY_ALL_BLOCKS_EN_LBN 64
+#define MEM_PARITY_ALL_BLOCKS_EN_WIDTH 38
+#define MEM_PARITY_TX_DATA_EN_LBN 72
+#define MEM_PARITY_TX_DATA_EN_WIDTH 2
+
+//////////////---- Event & Timer Module Registers C Header ----//////////////
+
+#if EFVI_FALCON_EXTENDED_P_BAR
+#define EVQ_RPTR_REG_KER_OFST 0x11B00 // Event queue read pointer register
+#else
+#define EVQ_RPTR_REG_KER_OFST 0x1B00 // Event queue read pointer register
+#endif
+
+#define EVQ_RPTR_REG_OFST 0xFA0000 // Event queue read pointer register array.
+ #define EVQ_RPTR_LBN 0
+ #define EVQ_RPTR_WIDTH 15
+
+#if EFVI_FALCON_EXTENDED_P_BAR
+#define EVQ_PTR_TBL_KER_OFST 0x11A00 // Event queue pointer table for kernel access
+#else
+#define EVQ_PTR_TBL_KER_OFST 0x1A00 // Event queue pointer table for kernel access
+#endif
+
+#define EVQ_PTR_TBL_CHAR_OFST 0xF60000 // Event queue pointer table for char direct access
+ #define EVQ_WKUP_OR_INT_EN_LBN 39
+ #define EVQ_WKUP_OR_INT_EN_WIDTH 1
+ #define EVQ_NXT_WPTR_LBN 24
+ #define EVQ_NXT_WPTR_WIDTH 15
+ #define EVQ_EN_LBN 23
+ #define EVQ_EN_WIDTH 1
+ #define EVQ_SIZE_LBN 20
+ #define EVQ_SIZE_WIDTH 3
+ #define EVQ_BUF_BASE_ID_LBN 0
+ #define EVQ_BUF_BASE_ID_WIDTH 20
+#define TIMER_CMD_REG_KER_OFST 0x420 // Timer table for kernel access. Page-mapped
+#define TIMER_CMD_REG_PAGE4_OFST 0x8420 // Timer table for user-level access. Page-mapped. For lowest 1K queues.
+#define TIMER_CMD_REG_PAGE123K_OFST 0x1000420 // Timer table for user-level access. Page-mapped. For upper 3K queues.
+#define TIMER_TBL_OFST 0xF70000 // Timer table for char driver direct access
+ #define TIMER_MODE_LBN 12
+ #define TIMER_MODE_WIDTH 2
+ #define TIMER_VAL_LBN 0
+ #define TIMER_VAL_WIDTH 12
+ #define TIMER_MODE_INT_HLDOFF 2
+ #define EVQ_BUF_SIZE_LBN 0
+ #define EVQ_BUF_SIZE_WIDTH 1
+#define DRV_EV_REG_KER_OFST 0x440 // Driver generated event register
+#define DRV_EV_REG_OFST 0x440 // Driver generated event register
+ #define DRV_EV_QID_LBN 64
+ #define DRV_EV_QID_WIDTH 12
+ #define DRV_EV_DATA_LBN 0
+ #define DRV_EV_DATA_WIDTH 64
+#define EVQ_CTL_REG_KER_OFST 0x450 // Event queue control register
+#define EVQ_CTL_REG_OFST 0x450 // Event queue control register
+ #define RX_EVQ_WAKEUP_MASK_B0_LBN 15
+ #define RX_EVQ_WAKEUP_MASK_B0_WIDTH 6
+ #define EVQ_OWNERR_CTL_LBN 14
+ #define EVQ_OWNERR_CTL_WIDTH 1
+ #define EVQ_FIFO_AF_TH_LBN 8
+ #define EVQ_FIFO_AF_TH_WIDTH 6
+ #define EVQ_FIFO_NOTAF_TH_LBN 0
+ #define EVQ_FIFO_NOTAF_TH_WIDTH 6
+//////////////---- SRAM Module Registers C Header ----//////////////
+#define BUF_TBL_CFG_REG_KER_OFST 0x600 // Buffer table configuration register
+#define BUF_TBL_CFG_REG_OFST 0x600 // Buffer table configuration register
+ #define BUF_TBL_MODE_LBN 3
+ #define BUF_TBL_MODE_WIDTH 1
+#define SRM_RX_DC_CFG_REG_KER_OFST 0x610 // SRAM receive descriptor cache configuration register
+#define SRM_RX_DC_CFG_REG_OFST 0x610 // SRAM receive descriptor cache configuration register
+ #define SRM_RX_DC_BASE_ADR_LBN 0
+ #define SRM_RX_DC_BASE_ADR_WIDTH 21
+#define SRM_TX_DC_CFG_REG_KER_OFST 0x620 // SRAM transmit descriptor cache configuration register
+#define SRM_TX_DC_CFG_REG_OFST 0x620 // SRAM transmit descriptor cache configuration register
+ #define SRM_TX_DC_BASE_ADR_LBN 0
+ #define SRM_TX_DC_BASE_ADR_WIDTH 21
+#define SRM_CFG_REG_KER_OFST 0x630 // SRAM configuration register
+#define SRM_CFG_REG_OFST 0x630 // SRAM configuration register
+ #define SRAM_OOB_ADR_INTEN_LBN 5
+ #define SRAM_OOB_ADR_INTEN_WIDTH 1
+ #define SRAM_OOB_BUF_INTEN_LBN 4
+ #define SRAM_OOB_BUF_INTEN_WIDTH 1
+ #define SRAM_BT_INIT_EN_LBN 3
+ #define SRAM_BT_INIT_EN_WIDTH 1
+ #define SRM_NUM_BANK_LBN 2
+ #define SRM_NUM_BANK_WIDTH 1
+ #define SRM_BANK_SIZE_LBN 0
+ #define SRM_BANK_SIZE_WIDTH 2
+#define BUF_TBL_UPD_REG_KER_OFST 0x650 // Buffer table update register
+#define BUF_TBL_UPD_REG_OFST 0x650 // Buffer table update register
+ #define BUF_UPD_CMD_LBN 63
+ #define BUF_UPD_CMD_WIDTH 1
+ #define BUF_CLR_CMD_LBN 62
+ #define BUF_CLR_CMD_WIDTH 1
+ #define BUF_CLR_END_ID_LBN 32
+ #define BUF_CLR_END_ID_WIDTH 20
+ #define BUF_CLR_START_ID_LBN 0
+ #define BUF_CLR_START_ID_WIDTH 20
+#define SRM_UPD_EVQ_REG_KER_OFST 0x660 // Buffer table update register
+#define SRM_UPD_EVQ_REG_OFST 0x660 // Buffer table update register
+ #define SRM_UPD_EVQ_ID_LBN 0
+ #define SRM_UPD_EVQ_ID_WIDTH 12
+#define SRAM_PARITY_REG_KER_OFST 0x670 // SRAM parity register.
+#define SRAM_PARITY_REG_OFST 0x670 // SRAM parity register.
+ #define FORCE_SRAM_PERR_LBN 0
+ #define FORCE_SRAM_PERR_WIDTH 1
+
+#if EFVI_FALCON_EXTENDED_P_BAR
+#define BUF_HALF_TBL_KER_OFST 0x18000 // Buffer table in half buffer table mode direct access by kernel driver
+#else
+#define BUF_HALF_TBL_KER_OFST 0x8000 // Buffer table in half buffer table mode direct access by kernel driver
+#endif
+
+
+#define BUF_HALF_TBL_OFST 0x800000 // Buffer table in half buffer table mode direct access by char driver
+ #define BUF_ADR_HBUF_ODD_LBN 44
+ #define BUF_ADR_HBUF_ODD_WIDTH 20
+ #define BUF_OWNER_ID_HBUF_ODD_LBN 32
+ #define BUF_OWNER_ID_HBUF_ODD_WIDTH 12
+ #define BUF_ADR_HBUF_EVEN_LBN 12
+ #define BUF_ADR_HBUF_EVEN_WIDTH 20
+ #define BUF_OWNER_ID_HBUF_EVEN_LBN 0
+ #define BUF_OWNER_ID_HBUF_EVEN_WIDTH 12
+
+
+#if EFVI_FALCON_EXTENDED_P_BAR
+#define BUF_FULL_TBL_KER_OFST 0x18000 // Buffer table in full buffer table mode direct access by kernel driver
+#else
+#define BUF_FULL_TBL_KER_OFST 0x8000 // Buffer table in full buffer table mode direct access by kernel driver
+#endif
+
+
+
+
+#define BUF_FULL_TBL_OFST 0x800000 // Buffer table in full buffer table mode direct access by char driver
+ #define IP_DAT_BUF_SIZE_LBN 50
+ #define IP_DAT_BUF_SIZE_WIDTH 1
+ #define BUF_ADR_REGION_LBN 48
+ #define BUF_ADR_REGION_WIDTH 2
+ #define BUF_ADR_FBUF_LBN 14
+ #define BUF_ADR_FBUF_WIDTH 34
+ #define BUF_OWNER_ID_FBUF_LBN 0
+ #define BUF_OWNER_ID_FBUF_WIDTH 14
+#define SRM_DBG_REG_OFST 0x3000000 // SRAM debug access
+ #define SRM_DBG_LBN 0
+ #define SRM_DBG_WIDTH 64
+//////////////---- RX Datapath Registers C Header ----//////////////
+
+#define RX_CFG_REG_KER_OFST 0x800 // Receive configuration register
+#define RX_CFG_REG_OFST 0x800 // Receive configuration register
+
+#if !defined(FALCON_64K_RXFIFO) && !defined(FALCON_PRE_02020029)
+# if !defined(FALCON_128K_RXFIFO)
+# define FALCON_128K_RXFIFO
+# endif
+#endif
+
+#if defined(FALCON_128K_RXFIFO)
+
+/* new for B0 */
+ #define RX_TOEP_TCP_SUPPRESS_B0_LBN 48
+ #define RX_TOEP_TCP_SUPPRESS_B0_WIDTH 1
+ #define RX_INGR_EN_B0_LBN 47
+ #define RX_INGR_EN_B0_WIDTH 1
+ #define RX_TOEP_IPV4_B0_LBN 46
+ #define RX_TOEP_IPV4_B0_WIDTH 1
+ #define RX_HASH_ALG_B0_LBN 45
+ #define RX_HASH_ALG_B0_WIDTH 1
+ #define RX_HASH_INSERT_HDR_B0_LBN 44
+ #define RX_HASH_INSERT_HDR_B0_WIDTH 1
+/* moved for B0 */
+ #define RX_DESC_PUSH_EN_B0_LBN 43
+ #define RX_DESC_PUSH_EN_B0_WIDTH 1
+ #define RX_RDW_PATCH_EN_LBN 42 /* Non head of line blocking */
+ #define RX_RDW_PATCH_EN_WIDTH 1
+ #define RX_PCI_BURST_SIZE_B0_LBN 39
+ #define RX_PCI_BURST_SIZE_B0_WIDTH 3
+ #define RX_OWNERR_CTL_B0_LBN 38
+ #define RX_OWNERR_CTL_B0_WIDTH 1
+ #define RX_XON_TX_TH_B0_LBN 33
+ #define RX_XON_TX_TH_B0_WIDTH 5
+ #define RX_XOFF_TX_TH_B0_LBN 28
+ #define RX_XOFF_TX_TH_B0_WIDTH 5
+ #define RX_USR_BUF_SIZE_B0_LBN 19
+ #define RX_USR_BUF_SIZE_B0_WIDTH 9
+ #define RX_XON_MAC_TH_B0_LBN 10
+ #define RX_XON_MAC_TH_B0_WIDTH 9
+ #define RX_XOFF_MAC_TH_B0_LBN 1
+ #define RX_XOFF_MAC_TH_B0_WIDTH 9
+ #define RX_XOFF_MAC_EN_B0_LBN 0
+ #define RX_XOFF_MAC_EN_B0_WIDTH 1
+
+#elif !defined(FALCON_PRE_02020029)
+/* new for B0 */
+ #define RX_TOEP_TCP_SUPPRESS_B0_LBN 46
+ #define RX_TOEP_TCP_SUPPRESS_B0_WIDTH 1
+ #define RX_INGR_EN_B0_LBN 45
+ #define RX_INGR_EN_B0_WIDTH 1
+ #define RX_TOEP_IPV4_B0_LBN 44
+ #define RX_TOEP_IPV4_B0_WIDTH 1
+ #define RX_HASH_ALG_B0_LBN 43
+ #define RX_HASH_ALG_B0_WIDTH 41
+ #define RX_HASH_INSERT_HDR_B0_LBN 42
+ #define RX_HASH_INSERT_HDR_B0_WIDTH 1
+/* moved for B0 */
+ #define RX_DESC_PUSH_EN_B0_LBN 41
+ #define RX_DESC_PUSH_EN_B0_WIDTH 1
+ #define RX_PCI_BURST_SIZE_B0_LBN 37
+ #define RX_PCI_BURST_SIZE_B0_WIDTH 3
+ #define RX_OWNERR_CTL_B0_LBN 36
+ #define RX_OWNERR_CTL_B0_WIDTH 1
+ #define RX_XON_TX_TH_B0_LBN 31
+ #define RX_XON_TX_TH_B0_WIDTH 5
+ #define RX_XOFF_TX_TH_B0_LBN 26
+ #define RX_XOFF_TX_TH_B0_WIDTH 5
+ #define RX_USR_BUF_SIZE_B0_LBN 17
+ #define RX_USR_BUF_SIZE_B0_WIDTH 9
+ #define RX_XON_MAC_TH_B0_LBN 9
+ #define RX_XON_MAC_TH_B0_WIDTH 8
+ #define RX_XOFF_MAC_TH_B0_LBN 1
+ #define RX_XOFF_MAC_TH_B0_WIDTH 8
+ #define RX_XOFF_MAC_EN_B0_LBN 0
+ #define RX_XOFF_MAC_EN_B0_WIDTH 1
+
+#else
+/* new for B0 */
+ #define RX_TOEP_TCP_SUPPRESS_B0_LBN 44
+ #define RX_TOEP_TCP_SUPPRESS_B0_WIDTH 1
+ #define RX_INGR_EN_B0_LBN 43
+ #define RX_INGR_EN_B0_WIDTH 1
+ #define RX_TOEP_IPV4_B0_LBN 42
+ #define RX_TOEP_IPV4_B0_WIDTH 1
+ #define RX_HASH_ALG_B0_LBN 41
+ #define RX_HASH_ALG_B0_WIDTH 41
+ #define RX_HASH_INSERT_HDR_B0_LBN 40
+ #define RX_HASH_INSERT_HDR_B0_WIDTH 1
+/* moved for B0 */
+ #define RX_DESC_PUSH_EN_B0_LBN 35
+ #define RX_DESC_PUSH_EN_B0_WIDTH 1
+ #define RX_PCI_BURST_SIZE_B0_LBN 35
+ #define RX_PCI_BURST_SIZE_B0_WIDTH 2
+ #define RX_OWNERR_CTL_B0_LBN 34
+ #define RX_OWNERR_CTL_B0_WIDTH 1
+ #define RX_XON_TX_TH_B0_LBN 29
+ #define RX_XON_TX_TH_B0_WIDTH 5
+ #define RX_XOFF_TX_TH_B0_LBN 24
+ #define RX_XOFF_TX_TH_B0_WIDTH 5
+ #define RX_USR_BUF_SIZE_B0_LBN 15
+ #define RX_USR_BUF_SIZE_B0_WIDTH 9
+ #define RX_XON_MAC_TH_B0_LBN 8
+ #define RX_XON_MAC_TH_B0_WIDTH 7
+ #define RX_XOFF_MAC_TH_B0_LBN 1
+ #define RX_XOFF_MAC_TH_B0_WIDTH 7
+ #define RX_XOFF_MAC_EN_B0_LBN 0
+ #define RX_XOFF_MAC_EN_B0_WIDTH 1
+
+#endif
+
+/* A0/A1 */
+ #define RX_PUSH_EN_A1_LBN 35
+ #define RX_PUSH_EN_A1_WIDTH 1
+ #define RX_PCI_BURST_SIZE_A1_LBN 31
+ #define RX_PCI_BURST_SIZE_A1_WIDTH 3
+ #define RX_OWNERR_CTL_A1_LBN 30
+ #define RX_OWNERR_CTL_A1_WIDTH 1
+ #define RX_XON_TX_TH_A1_LBN 25
+ #define RX_XON_TX_TH_A1_WIDTH 5
+ #define RX_XOFF_TX_TH_A1_LBN 20
+ #define RX_XOFF_TX_TH_A1_WIDTH 5
+ #define RX_USR_BUF_SIZE_A1_LBN 11
+ #define RX_USR_BUF_SIZE_A1_WIDTH 9
+ #define RX_XON_MAC_TH_A1_LBN 6
+ #define RX_XON_MAC_TH_A1_WIDTH 5
+ #define RX_XOFF_MAC_TH_A1_LBN 1
+ #define RX_XOFF_MAC_TH_A1_WIDTH 5
+ #define RX_XOFF_MAC_EN_A1_LBN 0
+ #define RX_XOFF_MAC_EN_A1_WIDTH 1
+
+#define RX_FILTER_CTL_REG_OFST 0x810 // Receive filter control registers
+ #define SCATTER_ENBL_NO_MATCH_Q_B0_LBN 40
+ #define SCATTER_ENBL_NO_MATCH_Q_B0_WIDTH 1
+ #define UDP_FULL_SRCH_LIMIT_LBN 32
+ #define UDP_FULL_SRCH_LIMIT_WIDTH 8
+ #define NUM_KER_LBN 24
+ #define NUM_KER_WIDTH 2
+ #define UDP_WILD_SRCH_LIMIT_LBN 16
+ #define UDP_WILD_SRCH_LIMIT_WIDTH 8
+ #define TCP_WILD_SRCH_LIMIT_LBN 8
+ #define TCP_WILD_SRCH_LIMIT_WIDTH 8
+ #define TCP_FULL_SRCH_LIMIT_LBN 0
+ #define TCP_FULL_SRCH_LIMIT_WIDTH 8
+#define RX_FLUSH_DESCQ_REG_KER_OFST 0x820 // Receive flush descriptor queue register
+#define RX_FLUSH_DESCQ_REG_OFST 0x820 // Receive flush descriptor queue register
+ #define RX_FLUSH_DESCQ_CMD_LBN 24
+ #define RX_FLUSH_DESCQ_CMD_WIDTH 1
+ #define RX_FLUSH_EVQ_ID_LBN 12
+ #define RX_FLUSH_EVQ_ID_WIDTH 12
+ #define RX_FLUSH_DESCQ_LBN 0
+ #define RX_FLUSH_DESCQ_WIDTH 12
+#define RX_DESC_UPD_REG_KER_OFST 0x830 // Kernel receive descriptor update register. Page-mapped
+#define RX_DESC_UPD_REG_PAGE4_OFST 0x8830 // Char & user receive descriptor update register. Page-mapped. For lowest 1K queues.
+#define RX_DESC_UPD_REG_PAGE123K_OFST 0x1000830 // Char & user receive descriptor update register. Page-mapped. For upper 3K queues.
+ #define RX_DESC_WPTR_LBN 96
+ #define RX_DESC_WPTR_WIDTH 12
+ #define RX_DESC_PUSH_CMD_LBN 95
+ #define RX_DESC_PUSH_CMD_WIDTH 1
+ #define RX_DESC_LBN 0
+ #define RX_DESC_WIDTH 64
+ #define RX_KER_DESC_LBN 0
+ #define RX_KER_DESC_WIDTH 64
+ #define RX_USR_DESC_LBN 0
+ #define RX_USR_DESC_WIDTH 32
+#define RX_DC_CFG_REG_KER_OFST 0x840 // Receive descriptor cache configuration register
+#define RX_DC_CFG_REG_OFST 0x840 // Receive descriptor cache configuration register
+ #define RX_DC_SIZE_LBN 0
+ #define RX_DC_SIZE_WIDTH 2
+#define RX_DC_PF_WM_REG_KER_OFST 0x850 // Receive descriptor cache pre-fetch watermark register
+#define RX_DC_PF_WM_REG_OFST 0x850 // Receive descriptor cache pre-fetch watermark register
+ #define RX_DC_PF_LWM_LO_LBN 0
+ #define RX_DC_PF_LWM_LO_WIDTH 6
+
+#define RX_RSS_TKEY_B0_OFST 0x860 // RSS Toeplitz hash key (B0 only)
+
+#define RX_NODESC_DROP_REG 0x880
+ #define RX_NODESC_DROP_CNT_LBN 0
+ #define RX_NODESC_DROP_CNT_WIDTH 16
+
+#define XM_TX_CFG_REG_OFST 0x1230
+ #define XM_AUTO_PAD_LBN 5
+ #define XM_AUTO_PAD_WIDTH 1
+
+#define RX_FILTER_TBL0_OFST 0xF00000 // Receive filter table - even entries
+ #define RSS_EN_0_B0_LBN 110
+ #define RSS_EN_0_B0_WIDTH 1
+ #define SCATTER_EN_0_B0_LBN 109
+ #define SCATTER_EN_0_B0_WIDTH 1
+ #define TCP_UDP_0_LBN 108
+ #define TCP_UDP_0_WIDTH 1
+ #define RXQ_ID_0_LBN 96
+ #define RXQ_ID_0_WIDTH 12
+ #define DEST_IP_0_LBN 64
+ #define DEST_IP_0_WIDTH 32
+ #define DEST_PORT_TCP_0_LBN 48
+ #define DEST_PORT_TCP_0_WIDTH 16
+ #define SRC_IP_0_LBN 16
+ #define SRC_IP_0_WIDTH 32
+ #define SRC_TCP_DEST_UDP_0_LBN 0
+ #define SRC_TCP_DEST_UDP_0_WIDTH 16
+#define RX_FILTER_TBL1_OFST 0xF00010 // Receive filter table - odd entries
+ #define RSS_EN_1_B0_LBN 110
+ #define RSS_EN_1_B0_WIDTH 1
+ #define SCATTER_EN_1_B0_LBN 109
+ #define SCATTER_EN_1_B0_WIDTH 1
+ #define TCP_UDP_1_LBN 108
+ #define TCP_UDP_1_WIDTH 1
+ #define RXQ_ID_1_LBN 96
+ #define RXQ_ID_1_WIDTH 12
+ #define DEST_IP_1_LBN 64
+ #define DEST_IP_1_WIDTH 32
+ #define DEST_PORT_TCP_1_LBN 48
+ #define DEST_PORT_TCP_1_WIDTH 16
+ #define SRC_IP_1_LBN 16
+ #define SRC_IP_1_WIDTH 32
+ #define SRC_TCP_DEST_UDP_1_LBN 0
+ #define SRC_TCP_DEST_UDP_1_WIDTH 16
+
+#if EFVI_FALCON_EXTENDED_P_BAR
+#define RX_DESC_PTR_TBL_KER_OFST 0x11800 // Receive descriptor pointer kernel access
+#else
+#define RX_DESC_PTR_TBL_KER_OFST 0x1800 // Receive descriptor pointer kernel access
+#endif
+
+
+#define RX_DESC_PTR_TBL_OFST 0xF40000 // Receive descriptor pointer table
+ #define RX_ISCSI_DDIG_EN_LBN 88
+ #define RX_ISCSI_DDIG_EN_WIDTH 1
+ #define RX_ISCSI_HDIG_EN_LBN 87
+ #define RX_ISCSI_HDIG_EN_WIDTH 1
+ #define RX_DESC_PREF_ACT_LBN 86
+ #define RX_DESC_PREF_ACT_WIDTH 1
+ #define RX_DC_HW_RPTR_LBN 80
+ #define RX_DC_HW_RPTR_WIDTH 6
+ #define RX_DESCQ_HW_RPTR_LBN 68
+ #define RX_DESCQ_HW_RPTR_WIDTH 12
+ #define RX_DESCQ_SW_WPTR_LBN 56
+ #define RX_DESCQ_SW_WPTR_WIDTH 12
+ #define RX_DESCQ_BUF_BASE_ID_LBN 36
+ #define RX_DESCQ_BUF_BASE_ID_WIDTH 20
+ #define RX_DESCQ_EVQ_ID_LBN 24
+ #define RX_DESCQ_EVQ_ID_WIDTH 12
+ #define RX_DESCQ_OWNER_ID_LBN 10
+ #define RX_DESCQ_OWNER_ID_WIDTH 14
+ #define RX_DESCQ_LABEL_LBN 5
+ #define RX_DESCQ_LABEL_WIDTH 5
+ #define RX_DESCQ_SIZE_LBN 3
+ #define RX_DESCQ_SIZE_WIDTH 2
+ #define RX_DESCQ_TYPE_LBN 2
+ #define RX_DESCQ_TYPE_WIDTH 1
+ #define RX_DESCQ_JUMBO_LBN 1
+ #define RX_DESCQ_JUMBO_WIDTH 1
+ #define RX_DESCQ_EN_LBN 0
+ #define RX_DESCQ_EN_WIDTH 1
+
+
+#define RX_RSS_INDIR_TBL_B0_OFST 0xFB0000 // RSS indirection table (B0 only)
+ #define RX_RSS_INDIR_ENT_B0_LBN 0
+ #define RX_RSS_INDIR_ENT_B0_WIDTH 6
+
+//////////////---- TX Datapath Registers C Header ----//////////////
+#define TX_FLUSH_DESCQ_REG_KER_OFST 0xA00 // Transmit flush descriptor queue register
+#define TX_FLUSH_DESCQ_REG_OFST 0xA00 // Transmit flush descriptor queue register
+ #define TX_FLUSH_DESCQ_CMD_LBN 12
+ #define TX_FLUSH_DESCQ_CMD_WIDTH 1
+ #define TX_FLUSH_DESCQ_LBN 0
+ #define TX_FLUSH_DESCQ_WIDTH 12
+#define TX_DESC_UPD_REG_KER_OFST 0xA10 // Kernel transmit descriptor update register. Page-mapped
+#define TX_DESC_UPD_REG_PAGE4_OFST 0x8A10 // Char & user transmit descriptor update register. Page-mapped
+#define TX_DESC_UPD_REG_PAGE123K_OFST 0x1000A10 // Char & user transmit descriptor update register. Page-mapped
+ #define TX_DESC_WPTR_LBN 96
+ #define TX_DESC_WPTR_WIDTH 12
+ #define TX_DESC_PUSH_CMD_LBN 95
+ #define TX_DESC_PUSH_CMD_WIDTH 1
+ #define TX_DESC_LBN 0
+ #define TX_DESC_WIDTH 95
+ #define TX_KER_DESC_LBN 0
+ #define TX_KER_DESC_WIDTH 64
+ #define TX_USR_DESC_LBN 0
+ #define TX_USR_DESC_WIDTH 64
+#define TX_DC_CFG_REG_KER_OFST 0xA20 // Transmit descriptor cache configuration register
+#define TX_DC_CFG_REG_OFST 0xA20 // Transmit descriptor cache configuration register
+ #define TX_DC_SIZE_LBN 0
+ #define TX_DC_SIZE_WIDTH 2
+
+#if EFVI_FALCON_EXTENDED_P_BAR
+#define TX_DESC_PTR_TBL_KER_OFST 0x11900 // Transmit descriptor pointer.
+#else
+#define TX_DESC_PTR_TBL_KER_OFST 0x1900 // Transmit descriptor pointer.
+#endif
+
+
+#define TX_DESC_PTR_TBL_OFST 0xF50000 // Transmit descriptor pointer
+ #define TX_NON_IP_DROP_DIS_B0_LBN 91
+ #define TX_NON_IP_DROP_DIS_B0_WIDTH 1
+ #define TX_IP_CHKSM_DIS_B0_LBN 90
+ #define TX_IP_CHKSM_DIS_B0_WIDTH 1
+ #define TX_TCP_CHKSM_DIS_B0_LBN 89
+ #define TX_TCP_CHKSM_DIS_B0_WIDTH 1
+ #define TX_DESCQ_EN_LBN 88
+ #define TX_DESCQ_EN_WIDTH 1
+ #define TX_ISCSI_DDIG_EN_LBN 87
+ #define TX_ISCSI_DDIG_EN_WIDTH 1
+ #define TX_ISCSI_HDIG_EN_LBN 86
+ #define TX_ISCSI_HDIG_EN_WIDTH 1
+ #define TX_DC_HW_RPTR_LBN 80
+ #define TX_DC_HW_RPTR_WIDTH 6
+ #define TX_DESCQ_HW_RPTR_LBN 68
+ #define TX_DESCQ_HW_RPTR_WIDTH 12
+ #define TX_DESCQ_SW_WPTR_LBN 56
+ #define TX_DESCQ_SW_WPTR_WIDTH 12
+ #define TX_DESCQ_BUF_BASE_ID_LBN 36
+ #define TX_DESCQ_BUF_BASE_ID_WIDTH 20
+ #define TX_DESCQ_EVQ_ID_LBN 24
+ #define TX_DESCQ_EVQ_ID_WIDTH 12
+ #define TX_DESCQ_OWNER_ID_LBN 10
+ #define TX_DESCQ_OWNER_ID_WIDTH 14
+ #define TX_DESCQ_LABEL_LBN 5
+ #define TX_DESCQ_LABEL_WIDTH 5
+ #define TX_DESCQ_SIZE_LBN 3
+ #define TX_DESCQ_SIZE_WIDTH 2
+ #define TX_DESCQ_TYPE_LBN 1
+ #define TX_DESCQ_TYPE_WIDTH 2
+ #define TX_DESCQ_FLUSH_LBN 0
+ #define TX_DESCQ_FLUSH_WIDTH 1
+#define TX_CFG_REG_KER_OFST 0xA50 // Transmit configuration register
+#define TX_CFG_REG_OFST 0xA50 // Transmit configuration register
+ #define TX_IP_ID_P1_OFS_LBN 32
+ #define TX_IP_ID_P1_OFS_WIDTH 15
+ #define TX_IP_ID_P0_OFS_LBN 16
+ #define TX_IP_ID_P0_OFS_WIDTH 15
+ #define TX_TURBO_EN_LBN 3
+ #define TX_TURBO_EN_WIDTH 1
+ #define TX_OWNERR_CTL_LBN 2
+ #define TX_OWNERR_CTL_WIDTH 2
+ #define TX_NON_IP_DROP_DIS_LBN 1
+ #define TX_NON_IP_DROP_DIS_WIDTH 1
+ #define TX_IP_ID_REP_EN_LBN 0
+ #define TX_IP_ID_REP_EN_WIDTH 1
+#define TX_RESERVED_REG_KER_OFST 0xA80 // Transmit configuration register
+#define TX_RESERVED_REG_OFST 0xA80 // Transmit configuration register
+ #define TX_CSR_PUSH_EN_LBN 89
+ #define TX_CSR_PUSH_EN_WIDTH 1
+ #define TX_RX_SPACER_LBN 64
+ #define TX_RX_SPACER_WIDTH 8
+ #define TX_SW_EV_EN_LBN 59
+ #define TX_SW_EV_EN_WIDTH 1
+ #define TX_RX_SPACER_EN_LBN 57
+ #define TX_RX_SPACER_EN_WIDTH 1
+ #define TX_CSR_PREF_WD_TMR_LBN 24
+ #define TX_CSR_PREF_WD_TMR_WIDTH 16
+ #define TX_CSR_ONLY1TAG_LBN 21
+ #define TX_CSR_ONLY1TAG_WIDTH 1
+ #define TX_PREF_THRESHOLD_LBN 19
+ #define TX_PREF_THRESHOLD_WIDTH 2
+ #define TX_ONE_PKT_PER_Q_LBN 18
+ #define TX_ONE_PKT_PER_Q_WIDTH 1
+ #define TX_DIS_NON_IP_EV_LBN 17
+ #define TX_DIS_NON_IP_EV_WIDTH 1
+ #define TX_DMA_SPACER_LBN 8
+ #define TX_DMA_SPACER_WIDTH 8
+ #define TX_FLUSH_MIN_LEN_EN_B0_LBN 7
+ #define TX_FLUSH_MIN_LEN_EN_B0_WIDTH 1
+ #define TX_TCP_DIS_A1_LBN 7
+ #define TX_TCP_DIS_A1_WIDTH 1
+ #define TX_IP_DIS_A1_LBN 6
+ #define TX_IP_DIS_A1_WIDTH 1
+ #define TX_MAX_CPL_LBN 2
+ #define TX_MAX_CPL_WIDTH 2
+ #define TX_MAX_PREF_LBN 0
+ #define TX_MAX_PREF_WIDTH 2
+#define TX_VLAN_REG_OFST 0xAE0 // Transmit VLAN tag register
+ #define TX_VLAN_EN_LBN 127
+ #define TX_VLAN_EN_WIDTH 1
+ #define TX_VLAN7_PORT1_EN_LBN 125
+ #define TX_VLAN7_PORT1_EN_WIDTH 1
+ #define TX_VLAN7_PORT0_EN_LBN 124
+ #define TX_VLAN7_PORT0_EN_WIDTH 1
+ #define TX_VLAN7_LBN 112
+ #define TX_VLAN7_WIDTH 12
+ #define TX_VLAN6_PORT1_EN_LBN 109
+ #define TX_VLAN6_PORT1_EN_WIDTH 1
+ #define TX_VLAN6_PORT0_EN_LBN 108
+ #define TX_VLAN6_PORT0_EN_WIDTH 1
+ #define TX_VLAN6_LBN 96
+ #define TX_VLAN6_WIDTH 12
+ #define TX_VLAN5_PORT1_EN_LBN 93
+ #define TX_VLAN5_PORT1_EN_WIDTH 1
+ #define TX_VLAN5_PORT0_EN_LBN 92
+ #define TX_VLAN5_PORT0_EN_WIDTH 1
+ #define TX_VLAN5_LBN 80
+ #define TX_VLAN5_WIDTH 12
+ #define TX_VLAN4_PORT1_EN_LBN 77
+ #define TX_VLAN4_PORT1_EN_WIDTH 1
+ #define TX_VLAN4_PORT0_EN_LBN 76
+ #define TX_VLAN4_PORT0_EN_WIDTH 1
+ #define TX_VLAN4_LBN 64
+ #define TX_VLAN4_WIDTH 12
+ #define TX_VLAN3_PORT1_EN_LBN 61
+ #define TX_VLAN3_PORT1_EN_WIDTH 1
+ #define TX_VLAN3_PORT0_EN_LBN 60
+ #define TX_VLAN3_PORT0_EN_WIDTH 1
+ #define TX_VLAN3_LBN 48
+ #define TX_VLAN3_WIDTH 12
+ #define TX_VLAN2_PORT1_EN_LBN 45
+ #define TX_VLAN2_PORT1_EN_WIDTH 1
+ #define TX_VLAN2_PORT0_EN_LBN 44
+ #define TX_VLAN2_PORT0_EN_WIDTH 1
+ #define TX_VLAN2_LBN 32
+ #define TX_VLAN2_WIDTH 12
+ #define TX_VLAN1_PORT1_EN_LBN 29
+ #define TX_VLAN1_PORT1_EN_WIDTH 1
+ #define TX_VLAN1_PORT0_EN_LBN 28
+ #define TX_VLAN1_PORT0_EN_WIDTH 1
+ #define TX_VLAN1_LBN 16
+ #define TX_VLAN1_WIDTH 12
+ #define TX_VLAN0_PORT1_EN_LBN 13
+ #define TX_VLAN0_PORT1_EN_WIDTH 1
+ #define TX_VLAN0_PORT0_EN_LBN 12
+ #define TX_VLAN0_PORT0_EN_WIDTH 1
+ #define TX_VLAN0_LBN 0
+ #define TX_VLAN0_WIDTH 12
+#define TX_FIL_CTL_REG_OFST 0xAF0 // Transmit filter control register
+ #define TX_MADR1_FIL_EN_LBN 65
+ #define TX_MADR1_FIL_EN_WIDTH 1
+ #define TX_MADR0_FIL_EN_LBN 64
+ #define TX_MADR0_FIL_EN_WIDTH 1
+ #define TX_IPFIL31_PORT1_EN_LBN 63
+ #define TX_IPFIL31_PORT1_EN_WIDTH 1
+ #define TX_IPFIL31_PORT0_EN_LBN 62
+ #define TX_IPFIL31_PORT0_EN_WIDTH 1
+ #define TX_IPFIL30_PORT1_EN_LBN 61
+ #define TX_IPFIL30_PORT1_EN_WIDTH 1
+ #define TX_IPFIL30_PORT0_EN_LBN 60
+ #define TX_IPFIL30_PORT0_EN_WIDTH 1
+ #define TX_IPFIL29_PORT1_EN_LBN 59
+ #define TX_IPFIL29_PORT1_EN_WIDTH 1
+ #define TX_IPFIL29_PORT0_EN_LBN 58
+ #define TX_IPFIL29_PORT0_EN_WIDTH 1
+ #define TX_IPFIL28_PORT1_EN_LBN 57
+ #define TX_IPFIL28_PORT1_EN_WIDTH 1
+ #define TX_IPFIL28_PORT0_EN_LBN 56
+ #define TX_IPFIL28_PORT0_EN_WIDTH 1
+ #define TX_IPFIL27_PORT1_EN_LBN 55
+ #define TX_IPFIL27_PORT1_EN_WIDTH 1
+ #define TX_IPFIL27_PORT0_EN_LBN 54
+ #define TX_IPFIL27_PORT0_EN_WIDTH 1
+ #define TX_IPFIL26_PORT1_EN_LBN 53
+ #define TX_IPFIL26_PORT1_EN_WIDTH 1
+ #define TX_IPFIL26_PORT0_EN_LBN 52
+ #define TX_IPFIL26_PORT0_EN_WIDTH 1
+ #define TX_IPFIL25_PORT1_EN_LBN 51
+ #define TX_IPFIL25_PORT1_EN_WIDTH 1
+ #define TX_IPFIL25_PORT0_EN_LBN 50
+ #define TX_IPFIL25_PORT0_EN_WIDTH 1
+ #define TX_IPFIL24_PORT1_EN_LBN 49
+ #define TX_IPFIL24_PORT1_EN_WIDTH 1
+ #define TX_IPFIL24_PORT0_EN_LBN 48
+ #define TX_IPFIL24_PORT0_EN_WIDTH 1
+ #define TX_IPFIL23_PORT1_EN_LBN 47
+ #define TX_IPFIL23_PORT1_EN_WIDTH 1
+ #define TX_IPFIL23_PORT0_EN_LBN 46
+ #define TX_IPFIL23_PORT0_EN_WIDTH 1
+ #define TX_IPFIL22_PORT1_EN_LBN 45
+ #define TX_IPFIL22_PORT1_EN_WIDTH 1
+ #define TX_IPFIL22_PORT0_EN_LBN 44
+ #define TX_IPFIL22_PORT0_EN_WIDTH 1
+ #define TX_IPFIL21_PORT1_EN_LBN 43
+ #define TX_IPFIL21_PORT1_EN_WIDTH 1
+ #define TX_IPFIL21_PORT0_EN_LBN 42
+ #define TX_IPFIL21_PORT0_EN_WIDTH 1
+ #define TX_IPFIL20_PORT1_EN_LBN 41
+ #define TX_IPFIL20_PORT1_EN_WIDTH 1
+ #define TX_IPFIL20_PORT0_EN_LBN 40
+ #define TX_IPFIL20_PORT0_EN_WIDTH 1
+ #define TX_IPFIL19_PORT1_EN_LBN 39
+ #define TX_IPFIL19_PORT1_EN_WIDTH 1
+ #define TX_IPFIL19_PORT0_EN_LBN 38
+ #define TX_IPFIL19_PORT0_EN_WIDTH 1
+ #define TX_IPFIL18_PORT1_EN_LBN 37
+ #define TX_IPFIL18_PORT1_EN_WIDTH 1
+ #define TX_IPFIL18_PORT0_EN_LBN 36
+ #define TX_IPFIL18_PORT0_EN_WIDTH 1
+ #define TX_IPFIL17_PORT1_EN_LBN 35
+ #define TX_IPFIL17_PORT1_EN_WIDTH 1
+ #define TX_IPFIL17_PORT0_EN_LBN 34
+ #define TX_IPFIL17_PORT0_EN_WIDTH 1
+ #define TX_IPFIL16_PORT1_EN_LBN 33
+ #define TX_IPFIL16_PORT1_EN_WIDTH 1
+ #define TX_IPFIL16_PORT0_EN_LBN 32
+ #define TX_IPFIL16_PORT0_EN_WIDTH 1
+ #define TX_IPFIL15_PORT1_EN_LBN 31
+ #define TX_IPFIL15_PORT1_EN_WIDTH 1
+ #define TX_IPFIL15_PORT0_EN_LBN 30
+ #define TX_IPFIL15_PORT0_EN_WIDTH 1
+ #define TX_IPFIL14_PORT1_EN_LBN 29
+ #define TX_IPFIL14_PORT1_EN_WIDTH 1
+ #define TX_IPFIL14_PORT0_EN_LBN 28
+ #define TX_IPFIL14_PORT0_EN_WIDTH 1
+ #define TX_IPFIL13_PORT1_EN_LBN 27
+ #define TX_IPFIL13_PORT1_EN_WIDTH 1
+ #define TX_IPFIL13_PORT0_EN_LBN 26
+ #define TX_IPFIL13_PORT0_EN_WIDTH 1
+ #define TX_IPFIL12_PORT1_EN_LBN 25
+ #define TX_IPFIL12_PORT1_EN_WIDTH 1
+ #define TX_IPFIL12_PORT0_EN_LBN 24
+ #define TX_IPFIL12_PORT0_EN_WIDTH 1
+ #define TX_IPFIL11_PORT1_EN_LBN 23
+ #define TX_IPFIL11_PORT1_EN_WIDTH 1
+ #define TX_IPFIL11_PORT0_EN_LBN 22
+ #define TX_IPFIL11_PORT0_EN_WIDTH 1
+ #define TX_IPFIL10_PORT1_EN_LBN 21
+ #define TX_IPFIL10_PORT1_EN_WIDTH 1
+ #define TX_IPFIL10_PORT0_EN_LBN 20
+ #define TX_IPFIL10_PORT0_EN_WIDTH 1
+ #define TX_IPFIL9_PORT1_EN_LBN 19
+ #define TX_IPFIL9_PORT1_EN_WIDTH 1
+ #define TX_IPFIL9_PORT0_EN_LBN 18
+ #define TX_IPFIL9_PORT0_EN_WIDTH 1
+ #define TX_IPFIL8_PORT1_EN_LBN 17
+ #define TX_IPFIL8_PORT1_EN_WIDTH 1
+ #define TX_IPFIL8_PORT0_EN_LBN 16
+ #define TX_IPFIL8_PORT0_EN_WIDTH 1
+ #define TX_IPFIL7_PORT1_EN_LBN 15
+ #define TX_IPFIL7_PORT1_EN_WIDTH 1
+ #define TX_IPFIL7_PORT0_EN_LBN 14
+ #define TX_IPFIL7_PORT0_EN_WIDTH 1
+ #define TX_IPFIL6_PORT1_EN_LBN 13
+ #define TX_IPFIL6_PORT1_EN_WIDTH 1
+ #define TX_IPFIL6_PORT0_EN_LBN 12
+ #define TX_IPFIL6_PORT0_EN_WIDTH 1
+ #define TX_IPFIL5_PORT1_EN_LBN 11
+ #define TX_IPFIL5_PORT1_EN_WIDTH 1
+ #define TX_IPFIL5_PORT0_EN_LBN 10
+ #define TX_IPFIL5_PORT0_EN_WIDTH 1
+ #define TX_IPFIL4_PORT1_EN_LBN 9
+ #define TX_IPFIL4_PORT1_EN_WIDTH 1
+ #define TX_IPFIL4_PORT0_EN_LBN 8
+ #define TX_IPFIL4_PORT0_EN_WIDTH 1
+ #define TX_IPFIL3_PORT1_EN_LBN 7
+ #define TX_IPFIL3_PORT1_EN_WIDTH 1
+ #define TX_IPFIL3_PORT0_EN_LBN 6
+ #define TX_IPFIL3_PORT0_EN_WIDTH 1
+ #define TX_IPFIL2_PORT1_EN_LBN 5
+ #define TX_IPFIL2_PORT1_EN_WIDTH 1
+ #define TX_IPFIL2_PORT0_EN_LBN 4
+ #define TX_IPFIL2_PORT0_EN_WIDTH 1
+ #define TX_IPFIL1_PORT1_EN_LBN 3
+ #define TX_IPFIL1_PORT1_EN_WIDTH 1
+ #define TX_IPFIL1_PORT0_EN_LBN 2
+ #define TX_IPFIL1_PORT0_EN_WIDTH 1
+ #define TX_IPFIL0_PORT1_EN_LBN 1
+ #define TX_IPFIL0_PORT1_EN_WIDTH 1
+ #define TX_IPFIL0_PORT0_EN_LBN 0
+ #define TX_IPFIL0_PORT0_EN_WIDTH 1
+#define TX_IPFIL_TBL_OFST 0xB00 // Transmit IP source address filter table
+ #define TX_IPFIL_MASK_LBN 32
+ #define TX_IPFIL_MASK_WIDTH 32
+ #define TX_IP_SRC_ADR_LBN 0
+ #define TX_IP_SRC_ADR_WIDTH 32
+#define TX_PACE_REG_A1_OFST 0xF80000 // Transmit pace control register
+#define TX_PACE_REG_B0_OFST 0xA90 // Transmit pace control register
+ #define TX_PACE_SB_AF_LBN 19
+ #define TX_PACE_SB_AF_WIDTH 10
+ #define TX_PACE_SB_NOTAF_LBN 9
+ #define TX_PACE_SB_NOTAF_WIDTH 10
+ #define TX_PACE_FB_BASE_LBN 5
+ #define TX_PACE_FB_BASE_WIDTH 4
+ #define TX_PACE_BIN_TH_LBN 0
+ #define TX_PACE_BIN_TH_WIDTH 5
+#define TX_PACE_TBL_A1_OFST 0xF80040 // Transmit pacing table
+#define TX_PACE_TBL_FIRST_QUEUE_A1 4
+#define TX_PACE_TBL_B0_OFST 0xF80000 // Transmit pacing table
+#define TX_PACE_TBL_FIRST_QUEUE_B0 0
+ #define TX_PACE_LBN 0
+ #define TX_PACE_WIDTH 5
+
+//////////////---- EE/Flash Registers C Header ----//////////////
+#define EE_SPI_HCMD_REG_KER_OFST 0x100 // SPI host command register
+#define EE_SPI_HCMD_REG_OFST 0x100 // SPI host command register
+ #define EE_SPI_HCMD_CMD_EN_LBN 31
+ #define EE_SPI_HCMD_CMD_EN_WIDTH 1
+ #define EE_WR_TIMER_ACTIVE_LBN 28
+ #define EE_WR_TIMER_ACTIVE_WIDTH 1
+ #define EE_SPI_HCMD_SF_SEL_LBN 24
+ #define EE_SPI_HCMD_SF_SEL_WIDTH 1
+ #define EE_SPI_HCMD_DABCNT_LBN 16
+ #define EE_SPI_HCMD_DABCNT_WIDTH 5
+ #define EE_SPI_HCMD_READ_LBN 15
+ #define EE_SPI_HCMD_READ_WIDTH 1
+ #define EE_SPI_HCMD_DUBCNT_LBN 12
+ #define EE_SPI_HCMD_DUBCNT_WIDTH 2
+ #define EE_SPI_HCMD_ADBCNT_LBN 8
+ #define EE_SPI_HCMD_ADBCNT_WIDTH 2
+ #define EE_SPI_HCMD_ENC_LBN 0
+ #define EE_SPI_HCMD_ENC_WIDTH 8
+#define EE_SPI_HADR_REG_KER_OFST 0X110 // SPI host address register
+#define EE_SPI_HADR_REG_OFST 0X110 // SPI host address register
+ #define EE_SPI_HADR_DUBYTE_LBN 24
+ #define EE_SPI_HADR_DUBYTE_WIDTH 8
+ #define EE_SPI_HADR_ADR_LBN 0
+ #define EE_SPI_HADR_ADR_WIDTH 24
+#define EE_SPI_HDATA_REG_KER_OFST 0x120 // SPI host data register
+#define EE_SPI_HDATA_REG_OFST 0x120 // SPI host data register
+ #define EE_SPI_HDATA3_LBN 96
+ #define EE_SPI_HDATA3_WIDTH 32
+ #define EE_SPI_HDATA2_LBN 64
+ #define EE_SPI_HDATA2_WIDTH 32
+ #define EE_SPI_HDATA1_LBN 32
+ #define EE_SPI_HDATA1_WIDTH 32
+ #define EE_SPI_HDATA0_LBN 0
+ #define EE_SPI_HDATA0_WIDTH 32
+#define EE_BASE_PAGE_REG_KER_OFST 0x130 // Expansion ROM base mirror register
+#define EE_BASE_PAGE_REG_OFST 0x130 // Expansion ROM base mirror register
+ #define EE_EXP_ROM_WINDOW_BASE_LBN 16
+ #define EE_EXP_ROM_WINDOW_BASE_WIDTH 13
+ #define EE_EXPROM_MASK_LBN 0
+ #define EE_EXPROM_MASK_WIDTH 13
+#define EE_VPD_CFG0_REG_KER_OFST 0X140 // SPI/VPD configuration register
+#define EE_VPD_CFG0_REG_OFST 0X140 // SPI/VPD configuration register
+ #define EE_SF_FASTRD_EN_LBN 127
+ #define EE_SF_FASTRD_EN_WIDTH 1
+ #define EE_SF_CLOCK_DIV_LBN 120
+ #define EE_SF_CLOCK_DIV_WIDTH 7
+ #define EE_VPD_WIP_POLL_LBN 119
+ #define EE_VPD_WIP_POLL_WIDTH 1
+ #define EE_VPDW_LENGTH_LBN 80
+ #define EE_VPDW_LENGTH_WIDTH 15
+ #define EE_VPDW_BASE_LBN 64
+ #define EE_VPDW_BASE_WIDTH 15
+ #define EE_VPD_WR_CMD_EN_LBN 56
+ #define EE_VPD_WR_CMD_EN_WIDTH 8
+ #define EE_VPD_BASE_LBN 32
+ #define EE_VPD_BASE_WIDTH 24
+ #define EE_VPD_LENGTH_LBN 16
+ #define EE_VPD_LENGTH_WIDTH 13
+ #define EE_VPD_AD_SIZE_LBN 8
+ #define EE_VPD_AD_SIZE_WIDTH 5
+ #define EE_VPD_ACCESS_ON_LBN 5
+ #define EE_VPD_ACCESS_ON_WIDTH 1
+#define EE_VPD_SW_CNTL_REG_KER_OFST 0X150 // VPD access SW control register
+#define EE_VPD_SW_CNTL_REG_OFST 0X150 // VPD access SW control register
+ #define EE_VPD_CYCLE_PENDING_LBN 31
+ #define EE_VPD_CYCLE_PENDING_WIDTH 1
+ #define EE_VPD_CYC_WRITE_LBN 28
+ #define EE_VPD_CYC_WRITE_WIDTH 1
+ #define EE_VPD_CYC_ADR_LBN 0
+ #define EE_VPD_CYC_ADR_WIDTH 15
+#define EE_VPD_SW_DATA_REG_KER_OFST 0x160 // VPD access SW data register
+#define EE_VPD_SW_DATA_REG_OFST 0x160 // VPD access SW data register
+ #define EE_VPD_CYC_DAT_LBN 0
+ #define EE_VPD_CYC_DAT_WIDTH 32
--- /dev/null
+//////////////---- Descriptors C Headers ----//////////////
+// Receive Kernel IP Descriptor
+ #define RX_KER_BUF_SIZE_LBN 48
+ #define RX_KER_BUF_SIZE_WIDTH 14
+ #define RX_KER_BUF_REGION_LBN 46
+ #define RX_KER_BUF_REGION_WIDTH 2
+ #define RX_KER_BUF_REGION0_DECODE 0
+ #define RX_KER_BUF_REGION1_DECODE 1
+ #define RX_KER_BUF_REGION2_DECODE 2
+ #define RX_KER_BUF_REGION3_DECODE 3
+ #define RX_KER_BUF_ADR_LBN 0
+ #define RX_KER_BUF_ADR_WIDTH 46
+// Receive User IP Descriptor
+ #define RX_USR_2BYTE_OFS_LBN 20
+ #define RX_USR_2BYTE_OFS_WIDTH 12
+ #define RX_USR_BUF_ID_LBN 0
+ #define RX_USR_BUF_ID_WIDTH 20
+// Transmit Kernel IP Descriptor
+ #define TX_KER_PORT_LBN 63
+ #define TX_KER_PORT_WIDTH 1
+ #define TX_KER_CONT_LBN 62
+ #define TX_KER_CONT_WIDTH 1
+ #define TX_KER_BYTE_CNT_LBN 48
+ #define TX_KER_BYTE_CNT_WIDTH 14
+ #define TX_KER_BUF_REGION_LBN 46
+ #define TX_KER_BUF_REGION_WIDTH 2
+ #define TX_KER_BUF_REGION0_DECODE 0
+ #define TX_KER_BUF_REGION1_DECODE 1
+ #define TX_KER_BUF_REGION2_DECODE 2
+ #define TX_KER_BUF_REGION3_DECODE 3
+ #define TX_KER_BUF_ADR_LBN 0
+ #define TX_KER_BUF_ADR_WIDTH 46
+// Transmit User IP Descriptor
+ #define TX_USR_PORT_LBN 47
+ #define TX_USR_PORT_WIDTH 1
+ #define TX_USR_CONT_LBN 46
+ #define TX_USR_CONT_WIDTH 1
+ #define TX_USR_BYTE_CNT_LBN 33
+ #define TX_USR_BYTE_CNT_WIDTH 13
+ #define TX_USR_BUF_ID_LBN 13
+ #define TX_USR_BUF_ID_WIDTH 20
+ #define TX_USR_BYTE_OFS_LBN 0
+ #define TX_USR_BYTE_OFS_WIDTH 13
--- /dev/null
+//////////////---- Events Format C Header ----//////////////
+//////////////---- Event entry ----//////////////
+ #define EV_CODE_LBN 60
+ #define EV_CODE_WIDTH 4
+ #define RX_IP_EV_DECODE 0
+ #define TX_IP_EV_DECODE 2
+ #define DRIVER_EV_DECODE 5
+ #define GLOBAL_EV_DECODE 6
+ #define DRV_GEN_EV_DECODE 7
+ #define EV_DATA_LBN 0
+ #define EV_DATA_WIDTH 60
+//////////////---- Receive IP events for both Kernel & User event queues ----//////////////
+ #define RX_EV_PKT_OK_LBN 56
+ #define RX_EV_PKT_OK_WIDTH 1
+ #define RX_EV_BUF_OWNER_ID_ERR_LBN 54
+ #define RX_EV_BUF_OWNER_ID_ERR_WIDTH 1
+ #define RX_EV_IP_HDR_CHKSUM_ERR_LBN 52
+ #define RX_EV_IP_HDR_CHKSUM_ERR_WIDTH 1
+ #define RX_EV_TCP_UDP_CHKSUM_ERR_LBN 51
+ #define RX_EV_TCP_UDP_CHKSUM_ERR_WIDTH 1
+ #define RX_EV_ETH_CRC_ERR_LBN 50
+ #define RX_EV_ETH_CRC_ERR_WIDTH 1
+ #define RX_EV_FRM_TRUNC_LBN 49
+ #define RX_EV_FRM_TRUNC_WIDTH 1
+ #define RX_EV_DRIB_NIB_LBN 48
+ #define RX_EV_DRIB_NIB_WIDTH 1
+ #define RX_EV_TOBE_DISC_LBN 47
+ #define RX_EV_TOBE_DISC_WIDTH 1
+ #define RX_EV_PKT_TYPE_LBN 44
+ #define RX_EV_PKT_TYPE_WIDTH 3
+ #define RX_EV_PKT_TYPE_ETH_DECODE 0
+ #define RX_EV_PKT_TYPE_LLC_DECODE 1
+ #define RX_EV_PKT_TYPE_JUMBO_DECODE 2
+ #define RX_EV_PKT_TYPE_VLAN_DECODE 3
+ #define RX_EV_PKT_TYPE_VLAN_LLC_DECODE 4
+ #define RX_EV_PKT_TYPE_VLAN_JUMBO_DECODE 5
+ #define RX_EV_HDR_TYPE_LBN 42
+ #define RX_EV_HDR_TYPE_WIDTH 2
+ #define RX_EV_HDR_TYPE_TCP_IPV4_DECODE 0
+ #define RX_EV_HDR_TYPE_UDP_IPV4_DECODE 1
+ #define RX_EV_HDR_TYPE_OTHER_IP_DECODE 2
+ #define RX_EV_HDR_TYPE_NON_IP_DECODE 3
+ #define RX_EV_DESC_Q_EMPTY_LBN 41
+ #define RX_EV_DESC_Q_EMPTY_WIDTH 1
+ #define RX_EV_MCAST_HASH_MATCH_LBN 40
+ #define RX_EV_MCAST_HASH_MATCH_WIDTH 1
+ #define RX_EV_MCAST_PKT_LBN 39
+ #define RX_EV_MCAST_PKT_WIDTH 1
+ #define RX_EV_Q_LABEL_LBN 32
+ #define RX_EV_Q_LABEL_WIDTH 5
+ #define RX_JUMBO_CONT_LBN 31
+ #define RX_JUMBO_CONT_WIDTH 1
+ #define RX_SOP_LBN 15
+ #define RX_SOP_WIDTH 1
+ #define RX_PORT_LBN 30
+ #define RX_PORT_WIDTH 1
+ #define RX_EV_BYTE_CNT_LBN 16
+ #define RX_EV_BYTE_CNT_WIDTH 14
+ #define RX_iSCSI_PKT_OK_LBN 14
+ #define RX_iSCSI_PKT_OK_WIDTH 1
+ #define RX_ISCSI_DDIG_ERR_LBN 13
+ #define RX_ISCSI_DDIG_ERR_WIDTH 1
+ #define RX_ISCSI_HDIG_ERR_LBN 12
+ #define RX_ISCSI_HDIG_ERR_WIDTH 1
+ #define RX_EV_DESC_PTR_LBN 0
+ #define RX_EV_DESC_PTR_WIDTH 12
+//////////////---- Transmit IP events for both Kernel & User event queues ----//////////////
+ #define TX_EV_PKT_ERR_LBN 38
+ #define TX_EV_PKT_ERR_WIDTH 1
+ #define TX_EV_PKT_TOO_BIG_LBN 37
+ #define TX_EV_PKT_TOO_BIG_WIDTH 1
+ #define TX_EV_Q_LABEL_LBN 32
+ #define TX_EV_Q_LABEL_WIDTH 5
+ #define TX_EV_PORT_LBN 16
+ #define TX_EV_PORT_WIDTH 1
+ #define TX_EV_WQ_FF_FULL_LBN 15
+ #define TX_EV_WQ_FF_FULL_WIDTH 1
+ #define TX_EV_BUF_OWNER_ID_ERR_LBN 14
+ #define TX_EV_BUF_OWNER_ID_ERR_WIDTH 1
+ #define TX_EV_COMP_LBN 12
+ #define TX_EV_COMP_WIDTH 1
+ #define TX_EV_DESC_PTR_LBN 0
+ #define TX_EV_DESC_PTR_WIDTH 12
+//////////////---- Char or Kernel driver events ----//////////////
+ #define DRIVER_EV_SUB_CODE_LBN 56
+ #define DRIVER_EV_SUB_CODE_WIDTH 4
+ #define TX_DESCQ_FLS_DONE_EV_DECODE 0x0
+ #define RX_DESCQ_FLS_DONE_EV_DECODE 0x1
+ #define EVQ_INIT_DONE_EV_DECODE 0x2
+ #define EVQ_NOT_EN_EV_DECODE 0x3
+ #define RX_DESCQ_FLSFF_OVFL_EV_DECODE 0x4
+ #define SRM_UPD_DONE_EV_DECODE 0x5
+ #define WAKE_UP_EV_DECODE 0x6
+ #define TX_PKT_NON_TCP_UDP_DECODE 0x9
+ #define TIMER_EV_DECODE 0xA
+ #define RX_DSC_ERROR_EV_DECODE 0xE
+ #define DRIVER_EV_TX_DESCQ_ID_LBN 0
+ #define DRIVER_EV_TX_DESCQ_ID_WIDTH 12
+ #define DRIVER_EV_RX_DESCQ_ID_LBN 0
+ #define DRIVER_EV_RX_DESCQ_ID_WIDTH 12
+ #define DRIVER_EV_EVQ_ID_LBN 0
+ #define DRIVER_EV_EVQ_ID_WIDTH 12
+ #define DRIVER_TMR_ID_LBN 0
+ #define DRIVER_TMR_ID_WIDTH 12
+ #define DRIVER_EV_SRM_UPD_LBN 0
+ #define DRIVER_EV_SRM_UPD_WIDTH 2
+ #define SRM_CLR_EV_DECODE 0
+ #define SRM_UPD_EV_DECODE 1
+ #define SRM_ILLCLR_EV_DECODE 2
+//////////////---- Global events. Sent to both event queue 0 and 4. ----//////////////
+ #define XFP_PHY_INTR_LBN 10
+ #define XFP_PHY_INTR_WIDTH 1
+ #define XG_PHY_INTR_LBN 9
+ #define XG_PHY_INTR_WIDTH 1
+ #define G_PHY1_INTR_LBN 8
+ #define G_PHY1_INTR_WIDTH 1
+ #define G_PHY0_INTR_LBN 7
+ #define G_PHY0_INTR_WIDTH 1
+//////////////---- Driver generated events ----//////////////
+ #define DRV_GEN_EV_CODE_LBN 60
+ #define DRV_GEN_EV_CODE_WIDTH 4
+ #define DRV_GEN_EV_DATA_LBN 0
+ #define DRV_GEN_EV_DATA_WIDTH 60
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * \author djr
+ * \brief Really-and-truely-honestly internal stuff for libef.
+ * \date 2004/06/13
+ */
+
+/*! \cidoxg_include_ci_ul */
+#ifndef __CI_EF_VI_INTERNAL_H__
+#define __CI_EF_VI_INTERNAL_H__
+
+
+/* These flags share space with enum ef_vi_flags. */
+#define EF_VI_BUG5692_WORKAROUND 0x10000
+
+
+/* ***********************************************************************
+ * COMPILATION CONTROL FLAGS (see ef_vi.h for "workaround" controls)
+ */
+
+#define EF_VI_DO_MAGIC_CHECKS 1
+
+
+/**********************************************************************
+ * Headers
+ */
+
+#include <etherfabric/ef_vi.h>
+#include "sysdep.h"
+#include "ef_vi_falcon.h"
+
+
+/**********************************************************************
+ * Debugging.
+ */
+
+#ifndef NDEBUG
+
+# define _ef_assert(exp, file, line) BUG_ON(!(exp));
+
+# define _ef_assert2(exp, x, y, file, line) do { \
+ if (unlikely(!(exp))) \
+ BUG(); \
+ } while (0)
+
+#else
+
+# define _ef_assert(exp, file, line)
+# define _ef_assert2(e, x, y, file, line)
+
+#endif
+
+#define ef_assert(a) do{ _ef_assert((a),__FILE__,__LINE__); } while(0)
+#define ef_assert_equal(a,b) _ef_assert2((a)==(b),(a),(b),__FILE__,__LINE__)
+#define ef_assert_eq ef_assert_equal
+#define ef_assert_lt(a,b) _ef_assert2((a)<(b),(a),(b),__FILE__,__LINE__)
+#define ef_assert_le(a,b) _ef_assert2((a)<=(b),(a),(b),__FILE__,__LINE__)
+#define ef_assert_nequal(a,b) _ef_assert2((a)!=(b),(a),(b),__FILE__,__LINE__)
+#define ef_assert_ne ef_assert_nequal
+#define ef_assert_ge(a,b) _ef_assert2((a)>=(b),(a),(b),__FILE__,__LINE__)
+#define ef_assert_gt(a,b) _ef_assert2((a)>(b),(a),(b),__FILE__,__LINE__)
+
+/**********************************************************************
+ * Debug checks. ******************************************************
+ **********************************************************************/
+
+#ifdef NDEBUG
+# define EF_VI_MAGIC_SET(p, type)
+# define EF_VI_CHECK_VI(p)
+# define EF_VI_CHECK_EVENT_Q(p)
+# define EF_VI_CHECK_IOBUFSET(p)
+# define EF_VI_CHECK_FILTER(p)
+# define EF_VI_CHECK_SHMBUF(p)
+# define EF_VI_CHECK_PT_EP(p)
+#else
+# define EF_VI 0x3
+# define EF_EPLOCK 0x6
+# define EF_IOBUFSET 0x9
+# define EF_FILTER 0xa
+# define EF_SHMBUF 0x11
+
+# define EF_VI_MAGIC(p, type) \
+ (((unsigned)(type) << 28) | \
+ (((unsigned)(intptr_t)(p)) & 0x0fffffffu))
+
+# if !EF_VI_DO_MAGIC_CHECKS
+# define EF_VI_MAGIC_SET(p, type)
+# define EF_VI_MAGIC_CHECK(p, type)
+# else
+# define EF_VI_MAGIC_SET(p, type) \
+ do { \
+ (p)->magic = EF_VI_MAGIC((p), (type)); \
+ } while (0)
+
+# define EF_VI_MAGIC_OKAY(p, type) \
+ ((p)->magic == EF_VI_MAGIC((p), (type)))
+
+# define EF_VI_MAGIC_CHECK(p, type) \
+ ef_assert(EF_VI_MAGIC_OKAY((p), (type)))
+
+#endif /* EF_VI_DO_MAGIC_CHECKS */
+
+# define EF_VI_CHECK_VI(p) \
+ ef_assert(p); \
+ EF_VI_MAGIC_CHECK((p), EF_VI);
+
+# define EF_VI_CHECK_EVENT_Q(p) \
+ ef_assert(p); \
+ EF_VI_MAGIC_CHECK((p), EF_VI); \
+ ef_assert((p)->evq_base); \
+ ef_assert((p)->evq_mask);
+
+# define EF_VI_CHECK_PT_EP(p) \
+ ef_assert(p); \
+ EF_VI_MAGIC_CHECK((p), EF_VI); \
+ ef_assert((p)->ep_state);
+
+# define EF_VI_CHECK_IOBUFSET(p) \
+ ef_assert(p); \
+ EF_VI_MAGIC_CHECK((p), EF_IOBUFSET)
+
+# define EF_VI_CHECK_FILTER(p) \
+ ef_assert(p); \
+ EF_VI_MAGIC_CHECK((p), EF_FILTER);
+
+# define EF_VI_CHECK_SHMBUF(p) \
+ ef_assert(p); \
+ EF_VI_MAGIC_CHECK((p), EF_SHMBUF);
+
+#endif
+
+#ifndef NDEBUG
+# define EF_DRIVER_MAGIC 0x00f00ba4
+# define EF_ASSERT_THIS_DRIVER_VALID(driver) \
+ do{ ef_assert(driver); \
+ EF_VI_MAGIC_CHECK((driver), EF_DRIVER_MAGIC); \
+ ef_assert((driver)->init); }while(0)
+
+# define EF_ASSERT_DRIVER_VALID() EF_ASSERT_THIS_DRIVER_VALID(&ci_driver)
+#else
+# define EF_ASSERT_THIS_DRIVER_VALID(driver)
+# define EF_ASSERT_DRIVER_VALID()
+#endif
+
+
+/* *************************************
+ * Power of 2 FIFO
+ */
+
+#define EF_VI_FIFO2_M(f, x) ((x) & ((f)->fifo_mask))
+#define ef_vi_fifo2_valid(f) ((f) && (f)->fifo && (f)->fifo_mask > 0 && \
+ (f)->fifo_rd_i <= (f)->fifo_mask && \
+ (f)->fifo_wr_i <= (f)->fifo_mask && \
+ EF_VI_IS_POW2((f)->fifo_mask+1u))
+
+#define ef_vi_fifo2_init(f, cap) \
+ do{ ef_assert(EF_VI_IS_POW2((cap) + 1)); \
+ (f)->fifo_rd_i = (f)->fifo_wr_i = 0u; \
+ (f)->fifo_mask = (cap); \
+ }while(0)
+
+#define ef_vi_fifo2_is_empty(f) ((f)->fifo_rd_i == (f)->fifo_wr_i)
+#define ef_vi_fifo2_capacity(f) ((f)->fifo_mask)
+#define ef_vi_fifo2_buf_size(f) ((f)->fifo_mask + 1u)
+#define ef_vi_fifo2_end(f) ((f)->fifo + ef_vi_fifo2_buf_size(f))
+#define ef_vi_fifo2_peek(f) ((f)->fifo[(f)->fifo_rd_i])
+#define ef_vi_fifo2_poke(f) ((f)->fifo[(f)->fifo_wr_i])
+#define ef_vi_fifo2_num(f) EF_VI_FIFO2_M((f),(f)->fifo_wr_i-(f)->fifo_rd_i)
+
+#define ef_vi_fifo2_wr_prev(f) \
+ do{ (f)->fifo_wr_i = EF_VI_FIFO2_M((f), (f)->fifo_wr_i - 1u); }while(0)
+#define ef_vi_fifo2_wr_next(f) \
+ do{ (f)->fifo_wr_i = EF_VI_FIFO2_M((f), (f)->fifo_wr_i + 1u); }while(0)
+#define ef_vi_fifo2_rd_adv(f, n) \
+ do{ (f)->fifo_rd_i = EF_VI_FIFO2_M((f), (f)->fifo_rd_i + (n)); }while(0)
+#define ef_vi_fifo2_rd_prev(f) \
+ do{ (f)->fifo_rd_i = EF_VI_FIFO2_M((f), (f)->fifo_rd_i - 1u); }while(0)
+#define ef_vi_fifo2_rd_next(f) \
+ do{ (f)->fifo_rd_i = EF_VI_FIFO2_M((f), (f)->fifo_rd_i + 1u); }while(0)
+
+#define ef_vi_fifo2_put(f, v) \
+ do{ ef_vi_fifo2_poke(f) = (v); ef_vi_fifo2_wr_next(f); }while(0)
+#define ef_vi_fifo2_get(f, pv) \
+ do{ *(pv) = ef_vi_fifo2_peek(f); ef_vi_fifo2_rd_next(f); }while(0)
+
+
+/* *********************************************************************
+ * Eventq handling
+ */
+
+typedef union {
+ uint64_t u64;
+ struct {
+ uint32_t a;
+ uint32_t b;
+ } opaque;
+} ef_vi_event;
+
+
+#define EF_VI_EVENT_OFFSET(q, i) \
+ (((q)->evq_state->evq_ptr - (i) * sizeof(ef_vi_event)) & (q)->evq_mask)
+
+#define EF_VI_EVENT_PTR(q, i) \
+ ((ef_vi_event*) ((q)->evq_base + EF_VI_EVENT_OFFSET((q), (i))))
+
+/* *********************************************************************
+ * Miscellaneous goodies
+ */
+#ifdef NDEBUG
+# define EF_VI_DEBUG(x)
+#else
+# define EF_VI_DEBUG(x) x
+#endif
+
+#define EF_VI_ROUND_UP(i, align) (((i)+(align)-1u) & ~((align)-1u))
+#define EF_VI_ALIGN_FWD(p, align) (((p)+(align)-1u) & ~((align)-1u))
+#define EF_VI_ALIGN_BACK(p, align) ((p) & ~((align)-1u))
+#define EF_VI_PTR_ALIGN_BACK(p, align) \
+ ((char*)EF_VI_ALIGN_BACK(((intptr_t)(p)), ((intptr_t)(align))))
+#define EF_VI_IS_POW2(x) ((x) && ! ((x) & ((x) - 1)))
+
+
+/* ********************************************************************
+ */
+
+extern void falcon_vi_init(ef_vi*, void* vvis ) EF_VI_HF;
+extern void ef_eventq_state_init(ef_vi* evq) EF_VI_HF;
+extern void __ef_init(void) EF_VI_HF;
+
+
+#endif /* __CI_EF_VI_INTERNAL_H__ */
+
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * \brief Virtual Interface
+ * \date 2007/05/16
+ */
+
+#ifndef __EFAB_EF_VI_H__
+#define __EFAB_EF_VI_H__
+
+
+/**********************************************************************
+ * Primitive types ****************************************************
+ **********************************************************************/
+
+/* We standardise on the types from stdint.h and synthesise these types
+ * for compilers/platforms that don't provide them */
+
+# include <linux/types.h>
+# define EF_VI_ALIGN(x) __attribute__ ((aligned (x)))
+# define ef_vi_inline static inline
+
+
+
+/**********************************************************************
+ * Types **************************************************************
+ **********************************************************************/
+
+typedef uint32_t ef_eventq_ptr;
+
+typedef uint64_t ef_addr;
+typedef char* ef_vi_ioaddr_t;
+
+/**********************************************************************
+ * ef_event ***********************************************************
+ **********************************************************************/
+
+/*! \i_ef_vi A DMA request identifier.
+**
+** This is an integer token specified by the transport and associated
+** with a DMA request. It is returned to the VI user with DMA completion
+** events. It is typically used to identify the buffer associated with
+** the transfer.
+*/
+typedef int ef_request_id;
+
+typedef union {
+ uint64_t u64[1];
+ uint32_t u32[2];
+} ef_vi_qword;
+
+typedef ef_vi_qword ef_hw_event;
+
+#define EF_REQUEST_ID_BITS 16u
+#define EF_REQUEST_ID_MASK ((1u << EF_REQUEST_ID_BITS) - 1u)
+
+/*! \i_ef_event An [ef_event] is a token that identifies something that
+** has happened. Examples include packets received, packets transmitted
+** and errors.
+*/
+typedef union {
+ struct {
+ ef_hw_event ev;
+ unsigned type :16;
+ } generic;
+ struct {
+ ef_hw_event ev;
+ unsigned type :16;
+ /*ef_request_id request_id :EF_REQUEST_ID_BITS;*/
+ unsigned q_id :16;
+ unsigned len :16;
+ unsigned flags :16;
+ } rx;
+ struct { /* This *must* have same layout as [rx]. */
+ ef_hw_event ev;
+ unsigned type :16;
+ /*ef_request_id request_id :EF_REQUEST_ID_BITS;*/
+ unsigned q_id :16;
+ unsigned len :16;
+ unsigned flags :16;
+ unsigned subtype :16;
+ } rx_discard;
+ struct {
+ ef_hw_event ev;
+ unsigned type :16;
+ /*ef_request_id request_id :EF_REQUEST_ID_BITS;*/
+ unsigned q_id :16;
+ } tx;
+ struct {
+ ef_hw_event ev;
+ unsigned type :16;
+ /*ef_request_id request_id :EF_REQUEST_ID_BITS;*/
+ unsigned q_id :16;
+ unsigned subtype :16;
+ } tx_error;
+ struct {
+ ef_hw_event ev;
+ unsigned type :16;
+ unsigned q_id :16;
+ } rx_no_desc_trunc;
+ struct {
+ ef_hw_event ev;
+ unsigned type :16;
+ unsigned data;
+ } sw;
+} ef_event;
+
+
+#define EF_EVENT_TYPE(e) ((e).generic.type)
+enum {
+ /** Good data was received. */
+ EF_EVENT_TYPE_RX,
+ /** Packets have been sent. */
+ EF_EVENT_TYPE_TX,
+ /** Data received and buffer consumed, but something is wrong. */
+ EF_EVENT_TYPE_RX_DISCARD,
+ /** Transmit of packet failed. */
+ EF_EVENT_TYPE_TX_ERROR,
+ /** Received packet was truncated due to lack of descriptors. */
+ EF_EVENT_TYPE_RX_NO_DESC_TRUNC,
+ /** Software generated event. */
+ EF_EVENT_TYPE_SW,
+ /** Event queue overflow. */
+ EF_EVENT_TYPE_OFLOW,
+};
+
+#define EF_EVENT_RX_BYTES(e) ((e).rx.len)
+#define EF_EVENT_RX_Q_ID(e) ((e).rx.q_id)
+#define EF_EVENT_RX_CONT(e) ((e).rx.flags & EF_EVENT_FLAG_CONT)
+#define EF_EVENT_RX_SOP(e) ((e).rx.flags & EF_EVENT_FLAG_SOP)
+#define EF_EVENT_RX_ISCSI_OKAY(e) ((e).rx.flags & EF_EVENT_FLAG_ISCSI_OK)
+#define EF_EVENT_FLAG_SOP 0x1
+#define EF_EVENT_FLAG_CONT 0x2
+#define EF_EVENT_FLAG_ISCSI_OK 0x4
+
+#define EF_EVENT_TX_Q_ID(e) ((e).tx.q_id)
+
+#define EF_EVENT_RX_DISCARD_Q_ID(e) ((e).rx_discard.q_id)
+#define EF_EVENT_RX_DISCARD_LEN(e) ((e).rx_discard.len)
+#define EF_EVENT_RX_DISCARD_TYPE(e) ((e).rx_discard.subtype)
+enum {
+ EF_EVENT_RX_DISCARD_CSUM_BAD,
+ EF_EVENT_RX_DISCARD_CRC_BAD,
+ EF_EVENT_RX_DISCARD_TRUNC,
+ EF_EVENT_RX_DISCARD_RIGHTS,
+ EF_EVENT_RX_DISCARD_OTHER,
+};
+
+#define EF_EVENT_TX_ERROR_Q_ID(e) ((e).tx_error.q_id)
+#define EF_EVENT_TX_ERROR_TYPE(e) ((e).tx_error.subtype)
+enum {
+ EF_EVENT_TX_ERROR_RIGHTS,
+ EF_EVENT_TX_ERROR_OFLOW,
+ EF_EVENT_TX_ERROR_2BIG,
+ EF_EVENT_TX_ERROR_BUS,
+};
+
+#define EF_EVENT_RX_NO_DESC_TRUNC_Q_ID(e) ((e).rx_no_desc_trunc.q_id)
+
+#define EF_EVENT_SW_DATA_MASK 0xffff
+#define EF_EVENT_SW_DATA(e) ((e).sw.data)
+
+#define EF_EVENT_FMT "[ev:%x:%08x:%08x]"
+#define EF_EVENT_PRI_ARG(e) (unsigned) (e).generic.type, \
+ (unsigned) (e).generic.ev.u32[1], \
+ (unsigned) (e).generic.ev.u32[0]
+
+#define EF_GET_HW_EV(e) ((e).generic.ev)
+#define EF_GET_HW_EV_PTR(e) (&(e).generic.ev)
+#define EF_GET_HW_EV_U64(e) ((e).generic.ev.u64[0])
+
+
+/* ***************** */
+
+/*! Used by netif shared state. Must use types of explicit size. */
+typedef struct {
+ uint16_t rx_last_desc_ptr; /* for RX duplicates */
+ uint8_t bad_sop; /* bad SOP detected */
+ uint8_t frag_num; /* next fragment #, 0=>SOP */
+} ef_rx_dup_state_t;
+
+
+/* Max number of ports on any SF NIC. */
+#define EFAB_DMAQS_PER_EVQ_MAX 32
+
+typedef struct {
+ ef_eventq_ptr evq_ptr;
+ int32_t trashed;
+ ef_rx_dup_state_t rx_dup_state[EFAB_DMAQS_PER_EVQ_MAX];
+} ef_eventq_state;
+
+
+/*! \i_ef_base [ef_iovec] is similar the standard [struct iovec]. An
+** array of these is used to designate a scatter/gather list of I/O
+** buffers.
+*/
+typedef struct {
+ ef_addr iov_base EF_VI_ALIGN(8);
+ unsigned iov_len;
+} ef_iovec;
+
+/* Falcon constants */
+#define TX_EV_DESC_PTR_LBN 0
+
+/**********************************************************************
+ * ef_iobufset ********************************************************
+ **********************************************************************/
+
+/*! \i_ef_bufs An [ef_iobufset] is a collection of buffers to be used
+** with the NIC.
+*/
+typedef struct ef_iobufset {
+ unsigned magic;
+ unsigned bufs_mmap_bytes;
+ unsigned bufs_handle;
+ int bufs_ptr_off;
+ ef_addr bufs_addr;
+ unsigned bufs_size; /* size rounded to pow2 */
+ int bufs_num;
+ int faultonaccess;
+} ef_iobufset;
+
+
+/**********************************************************************
+ * ef_vi **************************************************************
+ **********************************************************************/
+
+enum ef_vi_flags {
+ EF_VI_RX_SCATTER = 0x1,
+ EF_VI_ISCSI_RX_HDIG = 0x2,
+ EF_VI_ISCSI_TX_HDIG = 0x4,
+ EF_VI_ISCSI_RX_DDIG = 0x8,
+ EF_VI_ISCSI_TX_DDIG = 0x10,
+ EF_VI_TX_PHYS_ADDR = 0x20,
+ EF_VI_RX_PHYS_ADDR = 0x40,
+ EF_VI_TX_IP_CSUM_DIS = 0x80,
+ EF_VI_TX_TCPUDP_CSUM_DIS= 0x100,
+ EF_VI_TX_TCPUDP_ONLY = 0x200,
+ /* Flags in range 0xXXXX0000 are for internal use. */
+};
+
+typedef struct {
+ uint32_t added;
+ uint32_t removed;
+} ef_vi_txq_state;
+
+typedef struct {
+ uint32_t added;
+ uint32_t removed;
+} ef_vi_rxq_state;
+
+typedef struct {
+ uint32_t mask;
+ void* doorbell;
+ void* descriptors;
+ uint16_t* ids;
+ unsigned misalign_mask;
+} ef_vi_txq;
+
+typedef struct {
+ uint32_t mask;
+ void* doorbell;
+ void* descriptors;
+ uint16_t* ids;
+} ef_vi_rxq;
+
+typedef struct {
+ ef_eventq_state evq;
+ ef_vi_txq_state txq;
+ ef_vi_rxq_state rxq;
+ /* Followed by request id fifos. */
+} ef_vi_state;
+
+/*! \i_ef_vi A virtual interface.
+**
+** An [ef_vi] represents a virtual interface on a specific NIC. A
+** virtual interface is a collection of an event queue and two DMA queues
+** used to pass Ethernet frames between the transport implementation and
+** the network.
+*/
+typedef struct ef_vi {
+ unsigned magic;
+
+ unsigned vi_resource_id;
+ unsigned vi_resource_handle_hack;
+ unsigned vi_i;
+
+ char* vi_mem_mmap_ptr;
+ int vi_mem_mmap_bytes;
+ char* vi_io_mmap_ptr;
+ int vi_io_mmap_bytes;
+
+ ef_eventq_state* evq_state;
+ char* evq_base;
+ unsigned evq_mask;
+ ef_vi_ioaddr_t evq_timer_reg;
+
+ ef_vi_txq vi_txq;
+ ef_vi_rxq vi_rxq;
+ ef_vi_state* ep_state;
+ enum ef_vi_flags vi_flags;
+} ef_vi;
+
+
+enum ef_vi_arch {
+ EF_VI_ARCH_FALCON,
+};
+
+
+struct ef_vi_nic_type {
+ unsigned char arch;
+ char variant;
+ unsigned char revision;
+};
+
+
+/* This structure is opaque to the client & used to pass mapping data
+ * from the resource manager to the ef_vi lib. for ef_vi_init().
+ */
+struct vi_mappings {
+ uint32_t signature;
+# define VI_MAPPING_VERSION 0x02 /*Byte: Increment me if struct altered*/
+# define VI_MAPPING_SIGNATURE (0xBA1150 + VI_MAPPING_VERSION)
+
+ struct ef_vi_nic_type nic_type;
+
+ int vi_instance;
+
+ unsigned evq_bytes;
+ char* evq_base;
+ ef_vi_ioaddr_t evq_timer_reg;
+
+ unsigned rx_queue_capacity;
+ ef_vi_ioaddr_t rx_dma_ef1;
+ char* rx_dma_falcon;
+ ef_vi_ioaddr_t rx_bell;
+
+ unsigned tx_queue_capacity;
+ ef_vi_ioaddr_t tx_dma_ef1;
+ char* tx_dma_falcon;
+ ef_vi_ioaddr_t tx_bell;
+};
+/* This is used by clients to allocate a suitably sized buffer for the
+ * resource manager to fill & ef_vi_init() to use. */
+#define VI_MAPPINGS_SIZE (sizeof(struct vi_mappings))
+
+
+/**********************************************************************
+ * ef_config **********************************************************
+ **********************************************************************/
+
+struct ef_config_t {
+ int log; /* debug logging level */
+};
+
+extern struct ef_config_t ef_config;
+
+
+/**********************************************************************
+ * ef_vi **************************************************************
+ **********************************************************************/
+
+/* Initialise [data_area] with information required to initialise an ef_vi.
+ * In the following, an unused param should be set to NULL. Note the case
+ * marked (*) of [iobuf_mmap] for falcon/driver; for normal driver this
+ * must be NULL.
+ *
+ * \param data_area [in,out] required, must ref at least VI_MAPPINGS_SIZE
+ * bytes
+ * \param evq_capacity [in] number of events in event queue. Specify 0 for
+ * no event queue.
+ * \param rxq_capacity [in] number of descriptors in RX DMA queue. Specify
+ * 0 for no RX queue.
+ * \param txq_capacity [in] number of descriptors in TX DMA queue. Specify
+ * 0 for no TX queue.
+ * \param mmap_info [in] mem-map info for resource
+ * \param io_mmap [in] ef1, required
+ * falcon, required
+ * \param iobuf_mmap [in] ef1, UL: unused
+ * falcon, UL: required
+ */
+extern void ef_vi_init_mapping_vi(void* data_area, struct ef_vi_nic_type,
+ unsigned rxq_capacity,
+ unsigned txq_capacity, int instance,
+ void* io_mmap, void* iobuf_mmap_rx,
+ void* iobuf_mmap_tx, enum ef_vi_flags);
+
+
+extern void ef_vi_init_mapping_evq(void* data_area, struct ef_vi_nic_type,
+ int instance, unsigned evq_bytes,
+ void* base, void* timer_reg);
+
+ef_vi_inline unsigned ef_vi_resource_id(ef_vi* vi)
+{
+ return vi->vi_resource_id;
+}
+
+ef_vi_inline enum ef_vi_flags ef_vi_flags(ef_vi* vi)
+{
+ return vi->vi_flags;
+}
+
+
+/**********************************************************************
+ * Receive interface **************************************************
+ **********************************************************************/
+
+/*! \i_ef_vi Returns the amount of space in the RX descriptor ring.
+**
+** \return the amount of space in the queue.
+*/
+ef_vi_inline int ef_vi_receive_space(ef_vi* vi)
+{
+ ef_vi_rxq_state* qs = &vi->ep_state->rxq;
+ return vi->vi_rxq.mask - (qs->added - qs->removed);
+}
+
+
+/*! \i_ef_vi Returns the fill level of the RX descriptor ring.
+**
+** \return the fill level of the queue.
+*/
+ef_vi_inline int ef_vi_receive_fill_level(ef_vi* vi)
+{
+ ef_vi_rxq_state* qs = &vi->ep_state->rxq;
+ return qs->added - qs->removed;
+}
+
+
+ef_vi_inline int ef_vi_receive_capacity(ef_vi* vi)
+{
+ return vi->vi_rxq.mask;
+}
+
+/*! \i_ef_vi Complete a receive operation.
+**
+** When a receive completion event is received, it should be passed to
+** this function. The request-id for the buffer that the packet was
+** delivered to is returned.
+**
+** After this function returns, more space may be available in the
+** receive queue.
+*/
+extern ef_request_id ef_vi_receive_done(const ef_vi*, const ef_event*);
+
+/*! \i_ef_vi Return request ID indicated by a receive event
+ */
+ef_vi_inline ef_request_id ef_vi_receive_request_id(const ef_vi* vi,
+ const ef_event* ef_ev)
+{
+ const ef_vi_qword* ev = EF_GET_HW_EV_PTR(*ef_ev);
+ return ev->u32[0] & vi->vi_rxq.mask;
+}
+
+
+/*! \i_ef_vi Form a receive descriptor.
+**
+** If \c initial_rx_bytes is zero use a reception size at least as large
+** as an MTU.
+*/
+extern int ef_vi_receive_init(ef_vi* vi, ef_addr addr, ef_request_id dma_id,
+ int intial_rx_bytes);
+
+/*! \i_ef_vi Submit initialised receive descriptors to the NIC. */
+extern void ef_vi_receive_push(ef_vi* vi);
+
+/*! \i_ef_vi Post a buffer on the receive queue.
+**
+** \return 0 on success, or -EAGAIN if the receive queue is full
+*/
+extern int ef_vi_receive_post(ef_vi*, ef_addr addr,
+ ef_request_id dma_id);
+
+/**********************************************************************
+ * Transmit interface *************************************************
+ **********************************************************************/
+
+/*! \i_ef_vi Return the amount of space (in descriptors) in the transmit
+** queue.
+**
+** \return the amount of space in the queue (in descriptors)
+*/
+ef_vi_inline int ef_vi_transmit_space(ef_vi* vi)
+{
+ ef_vi_txq_state* qs = &vi->ep_state->txq;
+ return vi->vi_txq.mask - (qs->added - qs->removed);
+}
+
+
+/*! \i_ef_vi Returns the fill level of the TX descriptor ring.
+**
+** \return the fill level of the queue.
+*/
+ef_vi_inline int ef_vi_transmit_fill_level(ef_vi* vi)
+{
+ ef_vi_txq_state* qs = &vi->ep_state->txq;
+ return qs->added - qs->removed;
+}
+
+
+/*! \i_ef_vi Returns the total capacity of the TX descriptor ring.
+**
+** \return the capacity of the queue.
+*/
+ef_vi_inline int ef_vi_transmit_capacity(ef_vi* vi)
+{
+ return vi->vi_txq.mask;
+}
+
+
+/*! \i_ef_vi Transmit a packet.
+**
+** \param bytes must be greater than ETH_ZLEN.
+** \return -EAGAIN if the transmit queue is full, or 0 on success
+*/
+extern int ef_vi_transmit(ef_vi*, ef_addr, int bytes, ef_request_id dma_id);
+
+/*! \i_ef_vi Transmit a packet using a gather list.
+**
+** \param iov_len must be greater than zero
+** \param iov the first must be non-zero in length (but others need not)
+**
+** \return -EAGAIN if the queue is full, or 0 on success
+*/
+extern int ef_vi_transmitv(ef_vi*, const ef_iovec* iov, int iov_len,
+ ef_request_id dma_id);
+
+/*! \i_ef_vi Initialise a DMA request.
+**
+** \return -EAGAIN if the queue is full, or 0 on success
+*/
+extern int ef_vi_transmit_init(ef_vi*, ef_addr, int bytes,
+ ef_request_id dma_id);
+
+/*! \i_ef_vi Initialise a DMA request.
+**
+** \return -EAGAIN if the queue is full, or 0 on success
+*/
+extern int ef_vi_transmitv_init(ef_vi*, const ef_iovec*, int iov_len,
+ ef_request_id dma_id);
+
+/*! \i_ef_vi Submit DMA requests to the NIC.
+**
+** The DMA requests must have been initialised using
+** ef_vi_transmit_init() or ef_vi_transmitv_init().
+*/
+extern void ef_vi_transmit_push(ef_vi*);
+
+
+/*! \i_ef_vi Maximum number of transmit completions per transmit event. */
+#define EF_VI_TRANSMIT_BATCH 64
+
+/*! \i_ef_vi Determine the set of [ef_request_id]s for each DMA request
+** which has been completed by a given transmit completion
+** event.
+**
+** \param ids must point to an array of length EF_VI_TRANSMIT_BATCH
+** \return the number of valid [ef_request_id]s (can be zero)
+*/
+extern int ef_vi_transmit_unbundle(ef_vi* ep, const ef_event*,
+ ef_request_id* ids);
+
+
+/*! \i_ef_event Returns true if ef_eventq_poll() will return event(s). */
+extern int ef_eventq_has_event(ef_vi* vi);
+
+/*! \i_ef_event Returns true if there are quite a few events in the event
+** queue.
+**
+** This looks ahead in the event queue, so has the property that it will
+** not ping-pong a cache-line when it is called concurrently with events
+** being delivered.
+*/
+extern int ef_eventq_has_many_events(ef_vi* evq, int look_ahead);
+
+/*! Type of function to handle unknown events arriving on event queue
+** Return CI_TRUE iff the event has been handled.
+*/
+typedef int/*bool*/ ef_event_handler_fn(void* priv, ef_vi* evq, ef_event* ev);
+
+/*! Standard poll exception routine */
+extern int/*bool*/ ef_eventq_poll_exception(void* priv, ef_vi* evq,
+ ef_event* ev);
+
+/*! \i_ef_event Retrieve events from the event queue, handle RX/TX events
+** and pass any others to an exception handler function
+**
+** \return The number of events retrieved.
+*/
+extern int ef_eventq_poll_evs(ef_vi* evq, ef_event* evs, int evs_len,
+ ef_event_handler_fn *exception, void *expt_priv);
+
+/*! \i_ef_event Retrieve events from the event queue.
+**
+** \return The number of events retrieved.
+*/
+ef_vi_inline int ef_eventq_poll(ef_vi* evq, ef_event* evs, int evs_len)
+{
+ return ef_eventq_poll_evs(evq, evs, evs_len,
+ &ef_eventq_poll_exception, (void*)0);
+}
+
+/*! \i_ef_event Returns the capacity of an event queue. */
+ef_vi_inline int ef_eventq_capacity(ef_vi* vi)
+{
+ return (vi->evq_mask + 1u) / sizeof(ef_hw_event);
+}
+
+/* Returns the instance ID of [vi] */
+ef_vi_inline unsigned ef_vi_instance(ef_vi* vi)
+{ return vi->vi_i; }
+
+
+/**********************************************************************
+ * Initialisation *****************************************************
+ **********************************************************************/
+
+/*! Return size of state buffer of an initialised VI. */
+extern int ef_vi_state_bytes(ef_vi*);
+
+/*! Return size of buffer needed for VI state given sizes of RX and TX
+** DMA queues. Queue sizes must be legal sizes (power of 2), or 0 (no
+** queue).
+*/
+extern int ef_vi_calc_state_bytes(int rxq_size, int txq_size);
+
+/*! Initialise [ef_vi] from the provided resources. [vvis] must have been
+** created by ef_make_vi_data() & remains owned by the caller.
+*/
+extern void ef_vi_init(ef_vi*, void* vi_info, ef_vi_state* state,
+ ef_eventq_state* evq_state, enum ef_vi_flags);
+
+extern void ef_vi_state_init(ef_vi*);
+extern void ef_eventq_state_init(ef_vi*);
+
+/*! Convert an efhw device arch to ef_vi_arch, or returns -1 if not
+** recognised.
+*/
+extern int ef_vi_arch_from_efhw_arch(int efhw_arch);
+
+
+#endif /* __EFAB_EF_VI_H__ */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * \author djr
+ * \brief Routine to poll event queues.
+ * \date 2003/03/04
+ */
+
+/*! \cidoxg_lib_ef */
+#include "ef_vi_internal.h"
+
+/* Be worried about this on byteswapped machines */
+/* Due to crazy chipsets, we see the event words being written in
+** arbitrary order (bug4539). So test for presence of event must ensure
+** that both halves have changed from the null.
+*/
+# define EF_VI_IS_EVENT(evp) \
+ ( (((evp)->opaque.a != (uint32_t)-1) && \
+ ((evp)->opaque.b != (uint32_t)-1)) )
+
+
+#ifdef NDEBUG
+# define IS_DEBUG 0
+#else
+# define IS_DEBUG 1
+#endif
+
+
+/*! Check for RX events with inconsistent SOP/CONT
+**
+** Returns true if this event should be discarded
+*/
+ef_vi_inline int ef_eventq_is_rx_sop_cont_bad_efab(ef_vi* vi,
+ const ef_vi_qword* ev)
+{
+ ef_rx_dup_state_t* rx_dup_state;
+ uint8_t* bad_sop;
+
+ unsigned label = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
+ unsigned sop = QWORD_TEST_BIT(RX_SOP, *ev);
+
+ ef_assert(vi);
+ ef_assert_lt(label, EFAB_DMAQS_PER_EVQ_MAX);
+
+ rx_dup_state = &vi->evq_state->rx_dup_state[label];
+ bad_sop = &rx_dup_state->bad_sop;
+
+ if( ! ((vi->vi_flags & EF_VI_BUG5692_WORKAROUND) || IS_DEBUG) ) {
+ *bad_sop = (*bad_sop && !sop);
+ }
+ else {
+ unsigned cont = QWORD_TEST_BIT(RX_JUMBO_CONT, *ev);
+ uint8_t *frag_num = &rx_dup_state->frag_num;
+
+ /* bad_sop should latch till the next sop */
+ *bad_sop = (*bad_sop && !sop) || ( !!sop != (*frag_num==0) );
+
+ /* we do not check the number of bytes relative to the
+ * fragment number and size of the user rx buffer here
+ * because we don't know the size of the user rx
+ * buffer - we probably should perform this check in
+ * the nearest code calling this though.
+ */
+ *frag_num = cont ? (*frag_num + 1) : 0;
+ }
+
+ return *bad_sop;
+}
+
+
+ef_vi_inline int falcon_rx_check_dup(ef_vi* evq, ef_event* ev_out,
+ const ef_vi_qword* ev)
+{
+ unsigned q_id = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
+ unsigned desc_ptr = QWORD_GET_U(RX_EV_DESC_PTR, *ev);
+ ef_rx_dup_state_t* rx_dup_state = &evq->evq_state->rx_dup_state[q_id];
+
+ if(likely( desc_ptr != rx_dup_state->rx_last_desc_ptr )) {
+ rx_dup_state->rx_last_desc_ptr = desc_ptr;
+ return 0;
+ }
+
+ rx_dup_state->rx_last_desc_ptr = desc_ptr;
+ rx_dup_state->bad_sop = 1;
+#ifndef NDEBUG
+ rx_dup_state->frag_num = 0;
+#endif
+ BUG_ON(!QWORD_TEST_BIT(RX_EV_FRM_TRUNC, *ev));
+ BUG_ON( QWORD_TEST_BIT(RX_EV_PKT_OK, *ev));
+ BUG_ON(!QWORD_GET_U(RX_EV_BYTE_CNT, *ev) == 0);
+ ev_out->rx_no_desc_trunc.type = EF_EVENT_TYPE_RX_NO_DESC_TRUNC;
+ ev_out->rx_no_desc_trunc.q_id = q_id;
+ return 1;
+}
+
+
+ef_vi_inline void falcon_rx_event(ef_event* ev_out, const ef_vi_qword* ev)
+{
+ if(likely( QWORD_TEST_BIT(RX_EV_PKT_OK, *ev) )) {
+ ev_out->rx.type = EF_EVENT_TYPE_RX;
+ ev_out->rx.q_id = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
+ ev_out->rx.len = QWORD_GET_U(RX_EV_BYTE_CNT, *ev);
+ if( QWORD_TEST_BIT(RX_SOP, *ev) )
+ ev_out->rx.flags = EF_EVENT_FLAG_SOP;
+ else
+ ev_out->rx.flags = 0;
+ if( QWORD_TEST_BIT(RX_JUMBO_CONT, *ev) )
+ ev_out->rx.flags |= EF_EVENT_FLAG_CONT;
+ if( QWORD_TEST_BIT(RX_iSCSI_PKT_OK, *ev) )
+ ev_out->rx.flags |= EF_EVENT_FLAG_ISCSI_OK;
+ }
+ else {
+ ev_out->rx_discard.type = EF_EVENT_TYPE_RX_DISCARD;
+ ev_out->rx_discard.q_id = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
+ ev_out->rx_discard.len = QWORD_GET_U(RX_EV_BYTE_CNT, *ev);
+#if 1 /* hack for ptloop compatability: ?? TODO purge */
+ if( QWORD_TEST_BIT(RX_SOP, *ev) )
+ ev_out->rx_discard.flags = EF_EVENT_FLAG_SOP;
+ else
+ ev_out->rx_discard.flags = 0;
+ if( QWORD_TEST_BIT(RX_JUMBO_CONT, *ev) )
+ ev_out->rx_discard.flags |= EF_EVENT_FLAG_CONT;
+ if( QWORD_TEST_BIT(RX_iSCSI_PKT_OK, *ev) )
+ ev_out->rx_discard.flags |= EF_EVENT_FLAG_ISCSI_OK;
+#endif
+ /* Order matters here: more fundamental errors first. */
+ if( QWORD_TEST_BIT(RX_EV_BUF_OWNER_ID_ERR, *ev) )
+ ev_out->rx_discard.subtype =
+ EF_EVENT_RX_DISCARD_RIGHTS;
+ else if( QWORD_TEST_BIT(RX_EV_FRM_TRUNC, *ev) )
+ ev_out->rx_discard.subtype =
+ EF_EVENT_RX_DISCARD_TRUNC;
+ else if( QWORD_TEST_BIT(RX_EV_ETH_CRC_ERR, *ev) )
+ ev_out->rx_discard.subtype =
+ EF_EVENT_RX_DISCARD_CRC_BAD;
+ else if( QWORD_TEST_BIT(RX_EV_IP_HDR_CHKSUM_ERR, *ev) )
+ ev_out->rx_discard.subtype =
+ EF_EVENT_RX_DISCARD_CSUM_BAD;
+ else if( QWORD_TEST_BIT(RX_EV_TCP_UDP_CHKSUM_ERR, *ev) )
+ ev_out->rx_discard.subtype =
+ EF_EVENT_RX_DISCARD_CSUM_BAD;
+ else
+ ev_out->rx_discard.subtype =
+ EF_EVENT_RX_DISCARD_OTHER;
+ }
+}
+
+
+ef_vi_inline void falcon_tx_event(ef_event* ev_out, const ef_vi_qword* ev)
+{
+ /* Danger danger! No matter what we ask for wrt batching, we
+ ** will get a batched event every 16 descriptors, and we also
+ ** get dma-queue-empty events. i.e. Duplicates are expected.
+ **
+ ** In addition, if it's been requested in the descriptor, we
+ ** get an event per descriptor. (We don't currently request
+ ** this).
+ */
+ if(likely( QWORD_TEST_BIT(TX_EV_COMP, *ev) )) {
+ ev_out->tx.type = EF_EVENT_TYPE_TX;
+ ev_out->tx.q_id = QWORD_GET_U(TX_EV_Q_LABEL, *ev);
+ }
+ else {
+ ev_out->tx_error.type = EF_EVENT_TYPE_TX_ERROR;
+ ev_out->tx_error.q_id = QWORD_GET_U(TX_EV_Q_LABEL, *ev);
+ if(likely( QWORD_TEST_BIT(TX_EV_BUF_OWNER_ID_ERR, *ev) ))
+ ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_RIGHTS;
+ else if(likely( QWORD_TEST_BIT(TX_EV_WQ_FF_FULL, *ev) ))
+ ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_OFLOW;
+ else if(likely( QWORD_TEST_BIT(TX_EV_PKT_TOO_BIG, *ev) ))
+ ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_2BIG;
+ else if(likely( QWORD_TEST_BIT(TX_EV_PKT_ERR, *ev) ))
+ ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_BUS;
+ }
+}
+
+
+static void mark_bad(ef_event* ev)
+{
+ ev->generic.ev.u64[0] &=~ ((uint64_t) 1u << RX_EV_PKT_OK_LBN);
+}
+
+
+int ef_eventq_poll_evs(ef_vi* evq, ef_event* evs, int evs_len,
+ ef_event_handler_fn *exception, void *expt_priv)
+{
+ int evs_len_orig = evs_len;
+
+ EF_VI_CHECK_EVENT_Q(evq);
+ ef_assert(evs);
+ ef_assert_gt(evs_len, 0);
+
+ if(unlikely( EF_VI_IS_EVENT(EF_VI_EVENT_PTR(evq, 1)) ))
+ goto overflow;
+
+ do {
+ { /* Read the event out of the ring, then fiddle with
+ * copied version. Reason is that the ring is
+ * likely to get pushed out of cache by another
+ * event being delivered by hardware. */
+ ef_vi_event* ev = EF_VI_EVENT_PTR(evq, 0);
+ if( ! EF_VI_IS_EVENT(ev) )
+ break;
+ evs->generic.ev.u64[0] = cpu_to_le64 (ev->u64);
+ evq->evq_state->evq_ptr += sizeof(ef_vi_event);
+ ev->u64 = (uint64_t)(int64_t) -1;
+ }
+
+ /* Ugly: Exploit the fact that event code lies in top
+ * bits of event. */
+ ef_assert_ge(EV_CODE_LBN, 32u);
+ switch( evs->generic.ev.u32[1] >> (EV_CODE_LBN - 32u) ) {
+ case RX_IP_EV_DECODE:
+ /* Look for duplicate desc_ptr: it signals
+ * that a jumbo frame was truncated because we
+ * ran out of descriptors. */
+ if(unlikely( falcon_rx_check_dup
+ (evq, evs, &evs->generic.ev) )) {
+ --evs_len;
+ ++evs;
+ break;
+ }
+ else {
+ /* Cope with FalconA1 bugs where RX
+ * gives inconsistent RX events Mark
+ * events as bad until SOP becomes
+ * consistent again
+ * ef_eventq_is_rx_sop_cont_bad() has
+ * side effects - order is important
+ */
+ if(unlikely
+ (ef_eventq_is_rx_sop_cont_bad_efab
+ (evq, &evs->generic.ev) )) {
+ mark_bad(evs);
+ }
+ }
+ falcon_rx_event(evs, &evs->generic.ev);
+ --evs_len;
+ ++evs;
+ break;
+
+ case TX_IP_EV_DECODE:
+ falcon_tx_event(evs, &evs->generic.ev);
+ --evs_len;
+ ++evs;
+ break;
+
+ default:
+ break;
+ }
+ } while( evs_len );
+
+ return evs_len_orig - evs_len;
+
+
+ overflow:
+ evs->generic.type = EF_EVENT_TYPE_OFLOW;
+ evs->generic.ev.u64[0] = (uint64_t)((int64_t)-1);
+ return 1;
+}
+
+
+int/*bool*/ ef_eventq_poll_exception(void* priv, ef_vi* evq, ef_event* ev)
+{
+ int /*bool*/ handled = 0;
+
+ switch( ev->generic.ev.u32[1] >> (EV_CODE_LBN - 32u) ) {
+ case DRIVER_EV_DECODE:
+ if( QWORD_GET_U(DRIVER_EV_SUB_CODE, ev->generic.ev) ==
+ EVQ_INIT_DONE_EV_DECODE )
+ /* EVQ initialised event: ignore. */
+ handled = 1;
+ break;
+ }
+ return handled;
+}
+
+
+void ef_eventq_iterate(ef_vi* vi,
+ void (*fn)(void* arg, ef_vi*, int rel_pos,
+ int abs_pos, void* event),
+ void* arg, int stop_at_end)
+{
+ int i, size_evs = (vi->evq_mask + 1) / sizeof(ef_vi_event);
+
+ for( i = 0; i < size_evs; ++i ) {
+ ef_vi_event* e = EF_VI_EVENT_PTR(vi, -i);
+ if( EF_VI_IS_EVENT(e) )
+ fn(arg, vi, i,
+ EF_VI_EVENT_OFFSET(vi, -i) / sizeof(ef_vi_event),
+ e);
+ else if( stop_at_end )
+ break;
+ }
+}
+
+
+int ef_eventq_has_event(ef_vi* vi)
+{
+ return EF_VI_IS_EVENT(EF_VI_EVENT_PTR(vi, 0));
+}
+
+
+int ef_eventq_has_many_events(ef_vi* vi, int look_ahead)
+{
+ ef_assert_ge(look_ahead, 0);
+ return EF_VI_IS_EVENT(EF_VI_EVENT_PTR(vi, -look_ahead));
+}
+
+
+int ef_eventq_has_rx_event(ef_vi* vi)
+{
+ ef_vi_event* ev;
+ int i, n_evs = 0;
+
+ for( i = 0; EF_VI_IS_EVENT(EF_VI_EVENT_PTR(vi, i)); --i ) {
+ ev = EF_VI_EVENT_PTR(vi, i);
+ if( EFVI_FALCON_EVENT_CODE(ev) == EF_EVENT_TYPE_RX ) n_evs++;
+ }
+ return n_evs;
+}
+
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * \author djr, stg
+ * \brief Falcon-specific VI
+ * \date 2006/11/30
+ */
+
+#include "ef_vi_internal.h"
+
+
+#define EFVI_FALCON_DMA_TX_FRAG 1
+
+
+/* TX descriptor for both physical and virtual packet transfers */
+typedef union {
+ uint32_t dword[2];
+} ef_vi_falcon_dma_tx_buf_desc;
+typedef ef_vi_falcon_dma_tx_buf_desc ef_vi_falcon_dma_tx_phys_desc;
+
+
+/* RX descriptor for physical addressed transfers */
+typedef union {
+ uint32_t dword[2];
+} ef_vi_falcon_dma_rx_phys_desc;
+
+
+/* RX descriptor for virtual packet transfers */
+typedef struct {
+ uint32_t dword[1];
+} ef_vi_falcon_dma_rx_buf_desc;
+
+/* Buffer table index */
+typedef uint32_t ef_vi_buffer_addr_t;
+
+ef_vi_inline int64_t dma_addr_to_u46(int64_t src_dma_addr)
+{
+ return (src_dma_addr & __FALCON_MASK(46, int64_t));
+}
+
+/*! Setup a physical address based descriptor with a specified length */
+ef_vi_inline void
+__falcon_dma_rx_calc_ip_phys(ef_vi_dma_addr_t dest_pa,
+ ef_vi_falcon_dma_rx_phys_desc *desc,
+ int bytes)
+{
+ int region = 0; /* TODO fixme */
+ int64_t dest = dma_addr_to_u46(dest_pa); /* lower 46 bits */
+
+ DWCHCK(__DW2(RX_KER_BUF_SIZE_LBN), RX_KER_BUF_SIZE_WIDTH);
+ DWCHCK(__DW2(RX_KER_BUF_REGION_LBN),RX_KER_BUF_REGION_WIDTH);
+
+ LWCHK(RX_KER_BUF_ADR_LBN, RX_KER_BUF_ADR_WIDTH);
+
+ RANGECHCK(bytes, RX_KER_BUF_SIZE_WIDTH);
+ RANGECHCK(region, RX_KER_BUF_REGION_WIDTH);
+
+ ef_assert(desc);
+
+ desc->dword[1] = ((bytes << __DW2(RX_KER_BUF_SIZE_LBN)) |
+ (region << __DW2(RX_KER_BUF_REGION_LBN)) |
+ (HIGH(dest,
+ RX_KER_BUF_ADR_LBN,
+ RX_KER_BUF_ADR_WIDTH)));
+
+ desc->dword[0] = LOW(dest,
+ RX_KER_BUF_ADR_LBN,
+ RX_KER_BUF_ADR_WIDTH);
+}
+
+/*! Setup a virtual buffer descriptor for an IPMODE transfer */
+ef_vi_inline void
+__falcon_dma_tx_calc_ip_buf(unsigned buf_id, unsigned buf_ofs, unsigned bytes,
+ int port, int frag,
+ ef_vi_falcon_dma_tx_buf_desc *desc)
+{
+ DWCHCK(__DW2(TX_USR_PORT_LBN), TX_USR_PORT_WIDTH);
+ DWCHCK(__DW2(TX_USR_CONT_LBN), TX_USR_CONT_WIDTH);
+ DWCHCK(__DW2(TX_USR_BYTE_CNT_LBN), TX_USR_BYTE_CNT_WIDTH);
+ LWCHK(RX_KER_BUF_ADR_LBN, RX_KER_BUF_ADR_WIDTH);
+ DWCHCK(TX_USR_BYTE_OFS_LBN, TX_USR_BYTE_OFS_WIDTH);
+
+ RANGECHCK(bytes, TX_USR_BYTE_CNT_WIDTH);
+ RANGECHCK(port, TX_USR_PORT_WIDTH);
+ RANGECHCK(frag, TX_USR_CONT_WIDTH);
+ RANGECHCK(buf_id, TX_USR_BUF_ID_WIDTH);
+ RANGECHCK(buf_ofs, TX_USR_BYTE_OFS_WIDTH);
+
+ ef_assert(desc);
+
+ desc->dword[1] = ((port << __DW2(TX_USR_PORT_LBN)) |
+ (frag << __DW2(TX_USR_CONT_LBN)) |
+ (bytes << __DW2(TX_USR_BYTE_CNT_LBN)) |
+ (HIGH(buf_id,
+ TX_USR_BUF_ID_LBN,
+ TX_USR_BUF_ID_WIDTH)));
+
+ desc->dword[0] = ((LOW(buf_id,
+ TX_USR_BUF_ID_LBN,
+ (TX_USR_BUF_ID_WIDTH))) |
+ (buf_ofs << TX_USR_BYTE_OFS_LBN));
+}
+
+ef_vi_inline void
+falcon_dma_tx_calc_ip_buf_4k(unsigned buf_vaddr, unsigned bytes,
+ int port, int frag,
+ ef_vi_falcon_dma_tx_buf_desc *desc)
+{
+ /* TODO FIXME [buf_vaddr] consists of the buffer index in the
+ ** high bits, and an offset in the low bits. Assumptions
+ ** permate the code that these can be rolled into one 32bit
+ ** value, so this is currently preserved for Falcon. But we
+ ** should change to support 8K pages
+ */
+ unsigned buf_id = EFVI_FALCON_BUFFER_4K_PAGE(buf_vaddr);
+ unsigned buf_ofs = EFVI_FALCON_BUFFER_4K_OFF(buf_vaddr);
+
+ __falcon_dma_tx_calc_ip_buf( buf_id, buf_ofs, bytes, port, frag, desc);
+}
+
+ef_vi_inline void
+falcon_dma_tx_calc_ip_buf(unsigned buf_vaddr, unsigned bytes, int port,
+ int frag, ef_vi_falcon_dma_tx_buf_desc *desc)
+{
+ falcon_dma_tx_calc_ip_buf_4k(buf_vaddr, bytes, port, frag, desc);
+}
+
+/*! Setup a virtual buffer based descriptor */
+ef_vi_inline void
+__falcon_dma_rx_calc_ip_buf(unsigned buf_id, unsigned buf_ofs,
+ ef_vi_falcon_dma_rx_buf_desc *desc)
+{
+ /* check alignment of buffer offset and pack */
+ ef_assert((buf_ofs & 0x1) == 0);
+
+ buf_ofs >>= 1;
+
+ DWCHCK(RX_USR_2BYTE_OFS_LBN, RX_USR_2BYTE_OFS_WIDTH);
+ DWCHCK(RX_USR_BUF_ID_LBN, RX_USR_BUF_ID_WIDTH);
+
+ RANGECHCK(buf_ofs, RX_USR_2BYTE_OFS_WIDTH);
+ RANGECHCK(buf_id, RX_USR_BUF_ID_WIDTH);
+
+ ef_assert(desc);
+
+ desc->dword[0] = ((buf_ofs << RX_USR_2BYTE_OFS_LBN) |
+ (buf_id << RX_USR_BUF_ID_LBN));
+}
+
+ef_vi_inline void
+falcon_dma_rx_calc_ip_buf_4k(unsigned buf_vaddr,
+ ef_vi_falcon_dma_rx_buf_desc *desc)
+{
+ /* TODO FIXME [buf_vaddr] consists of the buffer index in the
+ ** high bits, and an offset in the low bits. Assumptions
+ ** permeate the code that these can be rolled into one 32bit
+ ** value, so this is currently preserved for Falcon. But we
+ ** should change to support 8K pages
+ */
+ unsigned buf_id = EFVI_FALCON_BUFFER_4K_PAGE(buf_vaddr);
+ unsigned buf_ofs = EFVI_FALCON_BUFFER_4K_OFF(buf_vaddr);
+
+ __falcon_dma_rx_calc_ip_buf(buf_id, buf_ofs, desc);
+}
+
+ef_vi_inline void
+falcon_dma_rx_calc_ip_buf(unsigned buf_vaddr,
+ ef_vi_falcon_dma_rx_buf_desc *desc)
+{
+ falcon_dma_rx_calc_ip_buf_4k(buf_vaddr, desc);
+}
+
+
+ef_vi_inline ef_vi_dma_addr_t ef_physaddr(ef_addr efaddr)
+{
+ return (ef_vi_dma_addr_t) efaddr;
+}
+
+
+/*! Convert between an ef_addr and a buffer table index
+** Assert that this was not a physical address
+*/
+ef_vi_inline ef_vi_buffer_addr_t ef_bufaddr(ef_addr efaddr)
+{
+ ef_assert(efaddr < ((uint64_t)1 << 32) );
+
+ return (ef_vi_buffer_addr_t) efaddr;
+}
+
+
+/*! Setup an physical address based descriptor for an IPMODE transfer */
+ef_vi_inline void
+falcon_dma_tx_calc_ip_phys(ef_vi_dma_addr_t src_dma_addr, unsigned bytes,
+ int port, int frag,
+ ef_vi_falcon_dma_tx_phys_desc *desc)
+{
+
+ int region = 0; /* FIXME */
+ int64_t src = dma_addr_to_u46(src_dma_addr); /* lower 46 bits */
+
+ DWCHCK(__DW2(TX_KER_PORT_LBN), TX_KER_PORT_WIDTH);
+ DWCHCK(__DW2(TX_KER_CONT_LBN), TX_KER_CONT_WIDTH);
+ DWCHCK(__DW2(TX_KER_BYTE_CNT_LBN), TX_KER_BYTE_CNT_WIDTH);
+ DWCHCK(__DW2(TX_KER_BUF_REGION_LBN),TX_KER_BUF_REGION_WIDTH);
+
+ LWCHK(TX_KER_BUF_ADR_LBN, TX_KER_BUF_ADR_WIDTH);
+
+ RANGECHCK(port, TX_KER_PORT_WIDTH);
+ RANGECHCK(frag, TX_KER_CONT_WIDTH);
+ RANGECHCK(bytes, TX_KER_BYTE_CNT_WIDTH);
+ RANGECHCK(region, TX_KER_BUF_REGION_WIDTH);
+
+ desc->dword[1] = ((port << __DW2(TX_KER_PORT_LBN)) |
+ (frag << __DW2(TX_KER_CONT_LBN)) |
+ (bytes << __DW2(TX_KER_BYTE_CNT_LBN)) |
+ (region << __DW2(TX_KER_BUF_REGION_LBN)) |
+ (HIGH(src,
+ TX_KER_BUF_ADR_LBN,
+ TX_KER_BUF_ADR_WIDTH)));
+
+ ef_assert_equal(TX_KER_BUF_ADR_LBN, 0);
+ desc->dword[0] = (uint32_t) src_dma_addr;
+}
+
+
+void falcon_vi_init(ef_vi* vi, void* vvis)
+{
+ struct vi_mappings *vm = (struct vi_mappings*)vvis;
+ uint16_t* ids;
+
+ ef_assert(vi);
+ ef_assert(vvis);
+ ef_assert_equal(vm->signature, VI_MAPPING_SIGNATURE);
+ ef_assert_equal(vm->nic_type.arch, EF_VI_ARCH_FALCON);
+
+ /* Initialise masks to zero, so that ef_vi_state_init() will
+ ** not do any harm when we don't have DMA queues. */
+ vi->vi_rxq.mask = vi->vi_txq.mask = 0;
+
+ /* Used for BUG5391_WORKAROUND. */
+ vi->vi_txq.misalign_mask = 0;
+
+ /* Initialise doorbell addresses to a distinctive small value
+ ** which will cause a segfault, to trap doorbell pushes to VIs
+ ** without DMA queues. */
+ vi->vi_rxq.doorbell = vi->vi_txq.doorbell = (ef_vi_ioaddr_t)0xdb;
+
+ ids = (uint16_t*) (vi->ep_state + 1);
+
+ if( vm->tx_queue_capacity ) {
+ vi->vi_txq.mask = vm->tx_queue_capacity - 1;
+ vi->vi_txq.doorbell = vm->tx_bell + 12;
+ vi->vi_txq.descriptors = vm->tx_dma_falcon;
+ vi->vi_txq.ids = ids;
+ ids += vi->vi_txq.mask + 1;
+ /* Check that the id fifo fits in the space allocated. */
+ ef_assert_le((char*) (vi->vi_txq.ids + vm->tx_queue_capacity),
+ (char*) vi->ep_state
+ + ef_vi_calc_state_bytes(vm->rx_queue_capacity,
+ vm->tx_queue_capacity));
+ }
+ if( vm->rx_queue_capacity ) {
+ vi->vi_rxq.mask = vm->rx_queue_capacity - 1;
+ vi->vi_rxq.doorbell = vm->rx_bell + 12;
+ vi->vi_rxq.descriptors = vm->rx_dma_falcon;
+ vi->vi_rxq.ids = ids;
+ /* Check that the id fifo fits in the space allocated. */
+ ef_assert_le((char*) (vi->vi_rxq.ids + vm->rx_queue_capacity),
+ (char*) vi->ep_state
+ + ef_vi_calc_state_bytes(vm->rx_queue_capacity,
+ vm->tx_queue_capacity));
+ }
+
+ if( vm->nic_type.variant == 'A' ) {
+ vi->vi_txq.misalign_mask = 15; /* BUG5391_WORKAROUND */
+ vi->vi_flags |= EF_VI_BUG5692_WORKAROUND;
+ }
+}
+
+
+int ef_vi_transmitv_init(ef_vi* vi, const ef_iovec* iov, int iov_len,
+ ef_request_id dma_id)
+{
+ ef_vi_txq* q = &vi->vi_txq;
+ ef_vi_txq_state* qs = &vi->ep_state->txq;
+ ef_vi_falcon_dma_tx_buf_desc* dp;
+ unsigned len, dma_len, di;
+ unsigned added_save = qs->added;
+ ef_addr dma_addr;
+ unsigned last_len = 0;
+
+ ef_assert(iov_len > 0);
+ ef_assert(iov);
+ ef_assert_equal((dma_id & EF_REQUEST_ID_MASK), dma_id);
+ ef_assert_nequal(dma_id, 0xffff);
+
+ dma_addr = iov->iov_base;
+ len = iov->iov_len;
+
+ if( vi->vi_flags & EF_VI_ISCSI_TX_DDIG ) {
+ /* Last 4 bytes of placeholder for digest must be
+ * removed for h/w */
+ ef_assert(len > 4);
+ last_len = iov[iov_len - 1].iov_len;
+ if( last_len <= 4 ) {
+ ef_assert(iov_len > 1);
+ --iov_len;
+ last_len = iov[iov_len - 1].iov_len - (4 - last_len);
+ }
+ else {
+ last_len = iov[iov_len - 1].iov_len - 4;
+ }
+ if( iov_len == 1 )
+ len = last_len;
+ }
+
+ while( 1 ) {
+ if( qs->added - qs->removed >= q->mask ) {
+ qs->added = added_save;
+ return -EAGAIN;
+ }
+
+ dma_len = (~((unsigned) dma_addr) & 0xfff) + 1;
+ if( dma_len > len ) dma_len = len;
+ { /* BUG5391_WORKAROUND */
+ unsigned misalign =
+ (unsigned) dma_addr & q->misalign_mask;
+ if( misalign && dma_len + misalign > 512 )
+ dma_len = 512 - misalign;
+ }
+
+ di = qs->added++ & q->mask;
+ dp = (ef_vi_falcon_dma_tx_buf_desc*) q->descriptors + di;
+ if( vi->vi_flags & EF_VI_TX_PHYS_ADDR )
+ falcon_dma_tx_calc_ip_phys
+ (ef_physaddr(dma_addr), dma_len, /*port*/ 0,
+ (iov_len == 1 && dma_len == len) ? 0 :
+ EFVI_FALCON_DMA_TX_FRAG, dp);
+ else
+ falcon_dma_tx_calc_ip_buf
+ (ef_bufaddr(dma_addr), dma_len, /*port*/ 0,
+ (iov_len == 1 && dma_len == len) ? 0 :
+ EFVI_FALCON_DMA_TX_FRAG, dp);
+
+ dma_addr += dma_len;
+ len -= dma_len;
+
+ if( len == 0 ) {
+ if( --iov_len == 0 ) break;
+ ++iov;
+ dma_addr = iov->iov_base;
+ len = iov->iov_len;
+ if( (vi->vi_flags & EF_VI_ISCSI_TX_DDIG) &&
+ (iov_len == 1) )
+ len = last_len;
+ }
+ }
+
+ q->ids[di] = (uint16_t) dma_id;
+ return 0;
+}
+
+
+void ef_vi_transmit_push(ef_vi* vi)
+{
+ ef_vi_wiob();
+ writel((vi->ep_state->txq.added & vi->vi_txq.mask) <<
+ __DW4(TX_DESC_WPTR_LBN),
+ vi->vi_txq.doorbell);
+}
+
+
+/*! The value of initial_rx_bytes is used to set RX_KER_BUF_SIZE in an initial
+** receive descriptor here if physical addressing is being used. A value of
+** zero represents 16384 bytes. This is okay, because caller must provide a
+** buffer than is > MTU, and mac should filter anything bigger than that.
+*/
+int ef_vi_receive_init(ef_vi* vi, ef_addr addr, ef_request_id dma_id,
+ int initial_rx_bytes)
+{
+ ef_vi_rxq* q = &vi->vi_rxq;
+ ef_vi_rxq_state* qs = &vi->ep_state->rxq;
+ unsigned di;
+
+ if( ef_vi_receive_space(vi) ) {
+ di = qs->added++ & q->mask;
+ ef_assert_equal(q->ids[di], 0xffff);
+ q->ids[di] = (uint16_t) dma_id;
+
+ if( ! (vi->vi_flags & EF_VI_RX_PHYS_ADDR) ) {
+ ef_vi_falcon_dma_rx_buf_desc* dp;
+ dp = (ef_vi_falcon_dma_rx_buf_desc*)
+ q->descriptors + di;
+ falcon_dma_rx_calc_ip_buf(ef_bufaddr(addr), dp);
+ }
+ else {
+ ef_vi_falcon_dma_rx_phys_desc* dp;
+ dp = (ef_vi_falcon_dma_rx_phys_desc*)
+ q->descriptors + di;
+ __falcon_dma_rx_calc_ip_phys(addr, dp,
+ initial_rx_bytes);
+ }
+
+ return 0;
+ }
+
+ return -EAGAIN;
+}
+
+
+void ef_vi_receive_push(ef_vi* vi)
+{
+ ef_vi_wiob();
+ writel ((vi->ep_state->rxq.added & vi->vi_rxq.mask) <<
+ __DW4(RX_DESC_WPTR_LBN),
+ vi->vi_rxq.doorbell);
+}
+
+
+ef_request_id ef_vi_receive_done(const ef_vi* vi, const ef_event* ef_ev)
+{
+ const ef_vi_qword* ev = EF_GET_HW_EV_PTR(*ef_ev);
+ unsigned di = ev->u32[0] & vi->vi_rxq.mask;
+ ef_request_id rq_id;
+
+ ef_assert(EF_EVENT_TYPE(*ef_ev) == EF_EVENT_TYPE_RX ||
+ EF_EVENT_TYPE(*ef_ev) == EF_EVENT_TYPE_RX_DISCARD);
+
+ /* Detect spurious / duplicate RX events. We may need to modify this
+ ** code so that we are robust if they happen. */
+ ef_assert_equal(di, vi->ep_state->rxq.removed & vi->vi_rxq.mask);
+
+ /* We only support 1 port: so events should be in order. */
+ ef_assert(vi->vi_rxq.ids[di] != 0xffff);
+
+ rq_id = vi->vi_rxq.ids[di];
+ vi->vi_rxq.ids[di] = 0xffff;
+ ++vi->ep_state->rxq.removed;
+ return rq_id;
+}
+
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * \author djr
+ * \brief Packet-mode transmit interface.
+ * \date 2003/04/02
+ */
+
+/*! \cidoxg_lib_ef */
+#include "ef_vi_internal.h"
+
+
+int ef_vi_transmit_init(ef_vi* vi, ef_addr base, int len, ef_request_id dma_id)
+{
+ ef_iovec iov = { base, len };
+ return ef_vi_transmitv_init(vi, &iov, 1, dma_id);
+}
+
+
+int ef_vi_transmit(ef_vi* vi, ef_addr base, int len, ef_request_id dma_id)
+{
+ ef_iovec iov = { base, len };
+ int rc = ef_vi_transmitv_init(vi, &iov, 1, dma_id);
+ if( rc == 0 ) ef_vi_transmit_push(vi);
+ return rc;
+}
+
+
+int ef_vi_transmitv(ef_vi* vi, const ef_iovec* iov, int iov_len,
+ ef_request_id dma_id)
+{
+ int rc = ef_vi_transmitv_init(vi, iov, iov_len, dma_id);
+ if( rc == 0 ) ef_vi_transmit_push(vi);
+ return rc;
+}
+
+
+int ef_vi_transmit_unbundle(ef_vi* vi, const ef_event* __ev,
+ ef_request_id* ids)
+{
+ ef_request_id* ids_in = ids;
+ ef_vi_txq* q = &vi->vi_txq;
+ ef_vi_txq_state* qs = &vi->ep_state->txq;
+ const ef_vi_qword* ev = EF_GET_HW_EV_PTR(*__ev);
+ unsigned i, stop = (ev->u32[0] + 1) & q->mask;
+
+ ef_assert(EF_EVENT_TYPE(*__ev) == EF_EVENT_TYPE_TX ||
+ EF_EVENT_TYPE(*__ev) == EF_EVENT_TYPE_TX_ERROR);
+
+ /* Shouldn't be batching more than 64 descriptors, and should not go
+ ** backwards. */
+ ef_assert_le((((ev->u32[0] + 1) - qs->removed) & q->mask), 64);
+ /* Should not complete more than we've posted. */
+ ef_assert_le((((ev->u32[0] + 1) - qs->removed) & q->mask),
+ qs->added - qs->removed);
+
+ for( i = qs->removed & q->mask; i != stop; i = ++qs->removed & q->mask )
+ if( q->ids[i] != 0xffff ) {
+ *ids++ = q->ids[i];
+ q->ids[i] = 0xffff;
+ }
+
+ ef_assert_le(ids - ids_in, EF_VI_TRANSMIT_BATCH);
+
+ return (int) (ids - ids_in);
+}
+
+/*! \cidoxg_end */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * \author stg
+ * \brief System dependent support for ef vi lib
+ * \date 2007/05/10
+ */
+
+/*! \cidoxg_include_ci_ul */
+#ifndef __CI_CIUL_SYSDEP_LINUX_H__
+#define __CI_CIUL_SYSDEP_LINUX_H__
+
+/**********************************************************************
+ * Kernel version compatability
+ */
+
+#if defined(__GNUC__)
+
+/* Linux kernel doesn't have stdint.h or [u]intptr_t. */
+# if !defined(LINUX_VERSION_CODE)
+# include <linux/version.h>
+# endif
+# include <asm/io.h>
+
+/* In Linux 2.6.24, linux/types.h has uintptr_t */
+# if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+# if BITS_PER_LONG == 32
+ typedef __u32 uintptr_t;
+# else
+ typedef __u64 uintptr_t;
+# endif
+# endif
+
+/* But even 2.6.24 doesn't define intptr_t */
+# if BITS_PER_LONG == 32
+ typedef __s32 intptr_t;
+# else
+ typedef __s64 intptr_t;
+# endif
+
+# if defined(__ia64__)
+# define EF_VI_PRIx64 "lx"
+# else
+# define EF_VI_PRIx64 "llx"
+# endif
+
+# define EF_VI_HF __attribute__((visibility("hidden")))
+# define EF_VI_HV __attribute__((visibility("hidden")))
+
+# if defined(__i386__) || defined(__x86_64__) /* GCC x86/x64 */
+ typedef unsigned long long ef_vi_dma_addr_t;
+# if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
+# define ef_vi_wiob() __asm__ __volatile__ ("sfence")
+# else
+# define ef_vi_wiob() __asm__ __volatile__ (".byte 0x0F, 0xAE, 0xF8")
+# endif
+
+# endif
+#endif
+
+#ifdef EFX_NOT_UPSTREAM
+
+/* Stuff for architectures/compilers not officially supported */
+
+#if !defined(__GNUC__)
+# if defined(__PPC__) /* GCC, PPC */
+ typedef unsigned long ef_vi_dma_addr_t;
+# define ef_vi_wiob() wmb()
+
+# ifdef __powerpc64__
+# ifdef CONFIG_SMP
+# define CI_SMP_SYNC "\n eieio \n" /* memory cache sync */
+# define CI_SMP_ISYNC "\n isync \n" /* instr cache sync */
+# else
+# define CI_SMP_SYNC
+# define CI_SMP_ISYNC
+# endif
+# else /* for ppc32 systems */
+# ifdef CONFIG_SMP
+# define CI_SMP_SYNC "\n eieio \n"
+# define CI_SMP_ISYNC "\n sync \n"
+# else
+# define CI_SMP_SYNC
+# define CI_SMP_ISYNC
+# endif
+# endif
+
+# elif defined(__ia64__) /* GCC, IA64 */
+ typedef unsigned long ef_vi_dma_addr_t;
+# define ef_vi_wiob() __asm__ __volatile__("mf.a": : :"memory")
+
+# else
+# error Unknown processor - GNU C
+# endif
+
+#elif defined(__PGI)
+# error PGI not supported
+
+#elif defined(__INTEL_COMPILER)
+
+/* Intel compilers v7 claim to be very gcc compatible. */
+# if __INTEL_COMPILER >= 700
+# if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ > 91)
+# define EF_VI_LIKELY(t) __builtin_expect((t), 1)
+# define EF_VI_UNLIKELY(t) __builtin_expect((t), 0)
+# endif
+
+# if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
+# define ef_vi_wiob() __asm__ __volatile__ ("sfence")
+# else
+# define ef_vi_wiob() __asm__ __volatile__ (".byte 0x0F, 0xAE, 0xF8")
+# endif
+
+# else
+# error Old Intel compiler not supported.
+# endif
+
+#else
+# error Unknown compiler.
+#endif
+
+#endif
+
+
+# include <linux/errno.h>
+
+
+/**********************************************************************
+ * Extracting bit fields.
+ */
+
+#define _QWORD_GET_LOW(f, v) \
+ (((v).u32[0] >> (f##_LBN)) & ((1u << f##_WIDTH) - 1u))
+#define _QWORD_GET_HIGH(f, v) \
+ (((v).u32[1] >> (f##_LBN - 32u)) & ((1u << f##_WIDTH) - 1u))
+#define _QWORD_GET_ANY(f, v) \
+ (((v).u64[0] >> f##_LBN) & (((uint64_t) 1u << f##_WIDTH) - 1u))
+
+#define QWORD_GET(f, v) \
+ ((f##_LBN + f##_WIDTH) <= 32u \
+ ? _QWORD_GET_LOW(f, (v)) \
+ : ((f##_LBN >= 32u) ? _QWORD_GET_HIGH(f, (v)) : _QWORD_GET_ANY(f, (v))))
+
+#define QWORD_GET_U(f, v) ((unsigned) QWORD_GET(f, (v)))
+
+#define _QWORD_TEST_BIT_LOW(f, v) ((v).u32[0] & (1u << (f##_LBN)))
+#define _QWORD_TEST_BIT_HIGH(f, v) ((v).u32[1] & (1u << (f##_LBN - 32u)))
+
+#define QWORD_TEST_BIT(f, v) \
+ (f##_LBN < 32 ? _QWORD_TEST_BIT_LOW(f, (v)) : _QWORD_TEST_BIT_HIGH(f, (v)))
+
+
+
+
+#ifndef DECLSPEC_NORETURN
+/* normally defined on Windows to expand to a declaration that the
+ function will not return */
+# define DECLSPEC_NORETURN
+#endif
+
+#endif /* __CI_CIUL_SYSDEP_LINUX_H__ */
--- /dev/null
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ * <linux-xen-drivers@solarflare.com>
+ * <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * \author djr
+ * \brief Initialisation of VIs.
+ * \date 2007/06/08
+ */
+
+#include "ef_vi_internal.h"
+
+#define EF_VI_STATE_BYTES(rxq_sz, txq_sz) \
+ (sizeof(ef_vi_state) + (rxq_sz) * sizeof(uint16_t) \
+ + (txq_sz) * sizeof(uint16_t))
+
+int ef_vi_calc_state_bytes(int rxq_sz, int txq_sz)
+{
+ ef_assert(rxq_sz == 0 || EF_VI_IS_POW2(rxq_sz));
+ ef_assert(txq_sz == 0 || EF_VI_IS_POW2(txq_sz));
+
+ return EF_VI_STATE_BYTES(rxq_sz, txq_sz);
+}
+
+
+int ef_vi_state_bytes(ef_vi* vi)
+{
+ int rxq_sz = 0, txq_sz = 0;
+ if( ef_vi_receive_capacity(vi) )
+ rxq_sz = ef_vi_receive_capacity(vi) + 1;
+ if( ef_vi_transmit_capacity(vi) )
+ txq_sz = ef_vi_transmit_capacity(vi) + 1;
+
+ ef_assert(rxq_sz == 0 || EF_VI_IS_POW2(rxq_sz));
+ ef_assert(txq_sz == 0 || EF_VI_IS_POW2(txq_sz));
+
+ return EF_VI_STATE_BYTES(rxq_sz, txq_sz);
+}
+
+
+void ef_eventq_state_init(ef_vi* evq)
+{
+ int j;
+
+ for (j = 0; j<EFAB_DMAQS_PER_EVQ_MAX; j++) {
+ ef_rx_dup_state_t *rx_dup_state =
+ &evq->evq_state->rx_dup_state[j];
+ rx_dup_state->bad_sop = 0;
+ rx_dup_state->rx_last_desc_ptr = -1;
+ rx_dup_state->frag_num = 0;
+ }
+
+ evq->evq_state->evq_ptr = 0;
+}
+
+
+void ef_vi_state_init(ef_vi* vi)
+{
+ ef_vi_state* state = vi->ep_state;
+ unsigned i;
+
+ state->txq.added = state->txq.removed = 0;
+ state->rxq.added = state->rxq.removed = 0;
+
+ if( vi->vi_rxq.mask )
+ for( i = 0; i <= vi->vi_rxq.mask; ++i )
+ vi->vi_rxq.ids[i] = (uint16_t) -1;
+ if( vi->vi_txq.mask )
+ for( i = 0; i <= vi->vi_txq.mask; ++i )
+ vi->vi_txq.ids[i] = (uint16_t) -1;
+}
+
+
+void ef_vi_init_mapping_evq(void* data_area, struct ef_vi_nic_type nic_type,
+ int instance, unsigned evq_bytes, void* base,
+ void* timer_reg)
+{
+ struct vi_mappings* vm = (struct vi_mappings*) data_area;
+
+ vm->signature = VI_MAPPING_SIGNATURE;
+ vm->vi_instance = instance;
+ vm->nic_type = nic_type;
+ vm->evq_bytes = evq_bytes;
+ vm->evq_base = base;
+ vm->evq_timer_reg = timer_reg;
+}
+
+
+void ef_vi_init(ef_vi* vi, void* vvis, ef_vi_state* state,
+ ef_eventq_state* evq_state, enum ef_vi_flags vi_flags)
+{
+ struct vi_mappings* vm = (struct vi_mappings*) vvis;
+
+ vi->vi_i = vm->vi_instance;
+ vi->ep_state = state;
+ vi->vi_flags = vi_flags;
+
+ switch( vm->nic_type.arch ) {
+ case EF_VI_ARCH_FALCON:
+ falcon_vi_init(vi, vvis);
+ break;
+ default:
+ /* ?? TODO: We should return an error code. */
+ ef_assert(0);
+ break;
+ }
+
+ if( vm->evq_bytes ) {
+ vi->evq_state = evq_state;
+ vi->evq_mask = vm->evq_bytes - 1u;
+ vi->evq_base = vm->evq_base;
+ vi->evq_timer_reg = vm->evq_timer_reg;
+ }
+
+ EF_VI_MAGIC_SET(vi, EF_VI);
+}
+
+
+/* Initialise [data_area] with information required to initialise an ef_vi.
+ * In the following, an unused param should be set to NULL. Note the case
+ * marked (*) of [iobuf_mmap] for falcon/driver; for the normal driver this
+ * must be NULL.
+ *
+ * \param data_area [in,out] required, must ref at least VI_MAPPING_SIZE
+ * bytes
+ * \param io_mmap [in] ef1, required
+ * falcon, required
+ * \param iobuf_mmap [in] ef1, unused
+ * falcon, required
+ */
+void ef_vi_init_mapping_vi(void* data_area, struct ef_vi_nic_type nic_type,
+ unsigned rxq_capacity, unsigned txq_capacity,
+ int instance, void* io_mmap,
+ void* iobuf_mmap_rx, void* iobuf_mmap_tx,
+ enum ef_vi_flags vi_flags)
+{
+ struct vi_mappings* vm = (struct vi_mappings*) data_area;
+ int rx_desc_bytes, rxq_bytes;
+
+ ef_assert(rxq_capacity > 0 || txq_capacity > 0);
+ ef_assert(vm);
+ ef_assert(io_mmap);
+ ef_assert(iobuf_mmap_rx || iobuf_mmap_tx);
+
+ vm->signature = VI_MAPPING_SIGNATURE;
+ vm->vi_instance = instance;
+ vm->nic_type = nic_type;
+
+ rx_desc_bytes = (vi_flags & EF_VI_RX_PHYS_ADDR) ? 8 : 4;
+ rxq_bytes = rxq_capacity * rx_desc_bytes;
+ rxq_bytes = (rxq_bytes + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+
+ if( iobuf_mmap_rx == iobuf_mmap_tx )
+ iobuf_mmap_tx = (char*) iobuf_mmap_rx + rxq_bytes;
+
+ vm->rx_queue_capacity = rxq_capacity;
+ vm->rx_dma_falcon = iobuf_mmap_rx;
+ vm->rx_bell = (char*) io_mmap + (RX_DESC_UPD_REG_KER_OFST & 4095);
+ vm->tx_queue_capacity = txq_capacity;
+ vm->tx_dma_falcon = iobuf_mmap_tx;
+ vm->tx_bell = (char*) io_mmap + (TX_DESC_UPD_REG_KER_OFST & 4095);
+}
--- /dev/null
+EXTRA_CFLAGS += -Idrivers/xen/sfc_netutil
+EXTRA_CFLAGS += -Werror
+
+ifdef GGOV
+EXTRA_CFLAGS += -fprofile-arcs -ftest-coverage -DEFX_GCOV
+endif
+
+obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_UTIL) := sfc_netutil.o
+
+sfc_netutil-objs := accel_cuckoo_hash.o accel_msg_iface.o accel_util.o
+
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <linux/types.h> /* needed for linux/random.h */
+#include <linux/random.h>
+
+#include "accel_cuckoo_hash.h"
+#include "accel_util.h"
+
+static inline int cuckoo_hash_key_compare(cuckoo_hash_table *hashtab,
+ cuckoo_hash_key *key1,
+ cuckoo_hash_key *key2)
+{
+ return !memcmp(key1, key2, hashtab->key_length);
+}
+
+
+static inline void cuckoo_hash_key_set(cuckoo_hash_key *key1,
+ cuckoo_hash_key *key2)
+{
+ *key1 = *key2;
+}
+
+
+/*
+ * Sets hash function parameters. Chooses "a" to be odd, 0 < a < 2^w
+ * where w is the length of the key
+ */
+static void set_hash_parameters(cuckoo_hash_table *hashtab)
+{
+ again:
+ hashtab->a0 = hashtab->a1 = 0;
+
+ /* Make sure random */
+ get_random_bytes(&hashtab->a0, hashtab->key_length);
+ get_random_bytes(&hashtab->a1, hashtab->key_length);
+
+ /* Make sure odd */
+ hashtab->a0 |= 1;
+ hashtab->a1 |= 1;
+
+ /* Being different is good */
+ if (hashtab->a0 != hashtab->a1)
+ return;
+
+ goto again;
+}
+
+int cuckoo_hash_init(cuckoo_hash_table *hashtab, unsigned length_bits,
+ unsigned key_length)
+{
+ char *table_mem;
+ unsigned length = 1 << length_bits;
+
+ BUG_ON(length_bits >= sizeof(unsigned) * 8);
+ BUG_ON(key_length > sizeof(cuckoo_hash_key));
+
+ table_mem = kmalloc(sizeof(cuckoo_hash_entry) * 2 * length, GFP_KERNEL);
+
+ if (table_mem == NULL)
+ return -ENOMEM;
+
+ hashtab->length = length;
+ hashtab->length_bits = length_bits;
+ hashtab->key_length = key_length;
+ hashtab->entries = 0;
+
+ hashtab->table0 = (cuckoo_hash_entry *)table_mem;
+ hashtab->table1 = (cuckoo_hash_entry *)
+ (table_mem + length * sizeof(cuckoo_hash_entry));
+
+ set_hash_parameters(hashtab);
+
+ /* Zero the table */
+ memset(hashtab->table0, 0, length * 2 * sizeof(cuckoo_hash_entry));
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_init);
+
+void cuckoo_hash_destroy(cuckoo_hash_table *hashtab)
+{
+ if (hashtab->table0 != NULL)
+ kfree(hashtab->table0);
+}
+
+EXPORT_SYMBOL_GPL(cuckoo_hash_destroy);
+
+/*
+ * This computes sizeof(cuckoo_hash) bits of hash, not all will be
+ * necessarily used, but the hash function throws away any that
+ * aren't
+ */
+static inline void cuckoo_compute_hash_helper(cuckoo_hash_table *hashtab,
+ cuckoo_hash_key *a,
+ cuckoo_hash_key *x,
+ cuckoo_hash *result)
+{
+ u64 multiply_result = 0, a_temp, x_temp;
+ u32 carry = 0;
+ u32 *a_words;
+ u32 *x_words;
+ int i;
+
+ /*
+ * As the mod and div operations in the function effectively
+ * reduce and shift the bits of the product down to just the
+ * third word, we need only compute that and return it as a
+ * result.
+ *
+ * Do enough long multiplication to get the word we need
+ */
+
+ /* This assumes things about the sizes of the key and hash */
+ BUG_ON(hashtab->key_length % sizeof(u32) != 0);
+ BUG_ON(sizeof(cuckoo_hash) != sizeof(u32));
+
+ a_words = (u32 *)a;
+ x_words = (u32 *)x;
+
+ for (i = 0; i < hashtab->key_length / sizeof(u32); i++) {
+ a_temp = a_words[i];
+ x_temp = x_words[i];
+
+ multiply_result = (a_temp * x_temp) + carry;
+ carry = (multiply_result >> 32) & 0xffffffff;
+ }
+
+ *result = multiply_result & 0xffffffff;
+}
+
+
+/*
+ * Want to implement (ax mod 2^w) div 2^(w-q) for odd a, 0 < a < 2^w;
+ * w is the length of the key, q is the length of the hash, I think.
+ * See http://www.it-c.dk/people/pagh/papers/cuckoo-jour.pdf
+ */
+static cuckoo_hash cuckoo_compute_hash(cuckoo_hash_table *hashtab,
+ cuckoo_hash_key *key,
+ cuckoo_hash_key *a)
+{
+ unsigned q = hashtab->length_bits;
+ unsigned shift = 32 - q;
+ unsigned mask = ((1 << q) - 1) << shift;
+ cuckoo_hash hash;
+
+ cuckoo_compute_hash_helper(hashtab, a, key, &hash);
+
+ /*
+ * Take the top few bits to get the right length for this
+ * hash table
+ */
+ hash = (hash & mask) >> shift;
+
+ BUG_ON(hash >= hashtab->length);
+
+ return hash;
+}
+
+
+static int cuckoo_hash_lookup0(cuckoo_hash_table *hashtab,
+ cuckoo_hash_key *key,
+ cuckoo_hash_value *value)
+{
+ cuckoo_hash hash = cuckoo_compute_hash(hashtab, key, &hashtab->a0);
+
+ if ((hashtab->table0[hash].state == CUCKOO_HASH_STATE_OCCUPIED)
+ && cuckoo_hash_key_compare(hashtab, &(hashtab->table0[hash].key),
+ key)) {
+ *value = hashtab->table0[hash].value;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int cuckoo_hash_lookup1(cuckoo_hash_table *hashtab,
+ cuckoo_hash_key *key,
+ cuckoo_hash_value *value)
+{
+ cuckoo_hash hash = cuckoo_compute_hash(hashtab, key, &hashtab->a1);
+
+ if ((hashtab->table1[hash].state == CUCKOO_HASH_STATE_OCCUPIED)
+ && cuckoo_hash_key_compare(hashtab, &(hashtab->table1[hash].key),
+ key)) {
+ *value = hashtab->table1[hash].value;
+ return 1;
+ }
+
+ return 0;
+}
+
+
+int cuckoo_hash_lookup(cuckoo_hash_table *hashtab, cuckoo_hash_key *key,
+ cuckoo_hash_value *value)
+{
+ return cuckoo_hash_lookup0(hashtab, key, value)
+ || cuckoo_hash_lookup1(hashtab, key, value);
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_lookup);
+
+
+/* Transfer any active entries from "old_table" into hashtab */
+static int cuckoo_hash_transfer_entries(cuckoo_hash_table *hashtab,
+ cuckoo_hash_entry *old_table,
+ unsigned capacity)
+{
+ int i, rc;
+ cuckoo_hash_entry *entry;
+
+ hashtab->entries = 0;
+
+ for (i = 0; i < capacity; i++) {
+ entry = &old_table[i];
+ if (entry->state == CUCKOO_HASH_STATE_OCCUPIED) {
+ rc = cuckoo_hash_add(hashtab, &(entry->key),
+ entry->value, 0);
+ if (rc != 0) {
+ return rc;
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+int cuckoo_hash_rehash(cuckoo_hash_table *hashtab)
+{
+ cuckoo_hash_entry *new_table;
+ cuckoo_hash_table old_hashtab;
+ int resize = 0, rc, rehash_count;
+
+ /*
+ * Store old tables so we can access the existing values and
+ * copy across
+ */
+ memcpy(&old_hashtab, hashtab, sizeof(cuckoo_hash_table));
+
+ /* resize if hashtable is more than half full */
+ if (old_hashtab.entries > old_hashtab.length &&
+ old_hashtab.length_bits < 32)
+ resize = 1;
+
+ resize:
+ if (resize) {
+ new_table = kmalloc(sizeof(cuckoo_hash_entry) * 4 * hashtab->length,
+ GFP_ATOMIC);
+ if (new_table == NULL) {
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ hashtab->length = 2 * hashtab->length;
+ hashtab->length_bits++;
+ } else {
+ new_table = kmalloc(sizeof(cuckoo_hash_entry) * 2 * hashtab->length,
+ GFP_ATOMIC);
+ if (new_table == NULL) {
+ rc = -ENOMEM;
+ goto err;
+ }
+ }
+
+ /*
+ * Point hashtab to new memory region so we can try to
+ * construct new table
+ */
+ hashtab->table0 = new_table;
+ hashtab->table1 = (cuckoo_hash_entry *)
+ ((char *)new_table + hashtab->length * sizeof(cuckoo_hash_entry));
+
+ rehash_count = 0;
+
+ again:
+ /* Zero the new tables */
+ memset(new_table, 0, hashtab->length * 2 * sizeof(cuckoo_hash_entry));
+
+ /* Choose new parameters for the hash functions */
+ set_hash_parameters(hashtab);
+
+ /*
+ * Multiply old_table_length by 2 as the length refers to each
+ * table, and there are two of them. This assumes that they
+ * are arranged sequentially in memory, so assert it
+ */
+ BUG_ON(((char *)old_hashtab.table1) !=
+ ((char *)old_hashtab.table0 + old_hashtab.length
+ * sizeof(cuckoo_hash_entry)));
+ rc = cuckoo_hash_transfer_entries(hashtab, old_hashtab.table0,
+ old_hashtab.length * 2);
+ if (rc < 0) {
+ /* Problem */
+ if (rc == -ENOSPC) {
+ ++rehash_count;
+ if (rehash_count < CUCKOO_HASH_MAX_LOOP) {
+ /*
+ * Wanted to rehash, but rather than
+ * recurse we can just do it here
+ */
+ goto again;
+ } else {
+ /*
+ * Didn't manage to rehash, so let's
+ * go up a size (if we haven't already
+ * and there's space)
+ */
+ if (!resize && hashtab->length_bits < 32) {
+ resize = 1;
+ kfree(new_table);
+ goto resize;
+ }
+ else
+ goto err;
+ }
+ }
+ else
+ goto err;
+ }
+
+ /* Success, I think. Free up the old table */
+ kfree(old_hashtab.table0);
+
+ /* We should have put all the entries from old table in the new one */
+ BUG_ON(hashtab->entries != old_hashtab.entries);
+
+ return 0;
+ err:
+ EPRINTK("%s: Rehash failed, giving up\n", __FUNCTION__);
+ /* Some other error, give up, at least restore table to how it was */
+ memcpy(hashtab, &old_hashtab, sizeof(cuckoo_hash_table));
+ if (new_table)
+ kfree(new_table);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_rehash);
+
+
+static int
+cuckoo_hash_insert_or_displace(cuckoo_hash_entry *table, unsigned hash,
+ cuckoo_hash_key *key,
+ cuckoo_hash_value value,
+ cuckoo_hash_key *displaced_key,
+ cuckoo_hash_value *displaced_value)
+{
+ if (table[hash].state == CUCKOO_HASH_STATE_VACANT) {
+ cuckoo_hash_key_set(&(table[hash].key), key);
+ table[hash].value = value;
+ table[hash].state = CUCKOO_HASH_STATE_OCCUPIED;
+
+ return 1;
+ } else {
+ cuckoo_hash_key_set(displaced_key, &(table[hash].key));
+ *displaced_value = table[hash].value;
+ cuckoo_hash_key_set(&(table[hash].key), key);
+ table[hash].value = value;
+
+ return 0;
+ }
+}
+
+
+int cuckoo_hash_add(cuckoo_hash_table *hashtab, cuckoo_hash_key *key,
+ cuckoo_hash_value value, int can_rehash)
+{
+ cuckoo_hash hash0, hash1;
+ int i, rc;
+ cuckoo_hash_key key1, key2;
+
+ cuckoo_hash_key_set(&key1, key);
+
+ again:
+ i = 0;
+ do {
+ hash0 = cuckoo_compute_hash(hashtab, &key1, &hashtab->a0);
+ if (cuckoo_hash_insert_or_displace(hashtab->table0, hash0,
+ &key1, value, &key2,
+ &value)) {
+ /* Success */
+ hashtab->entries++;
+ return 0;
+ }
+
+ hash1 = cuckoo_compute_hash(hashtab, &key2, &hashtab->a1);
+ if (cuckoo_hash_insert_or_displace(hashtab->table1, hash1,
+ &key2, value, &key1,
+ &value)) {
+ /* Success */
+ hashtab->entries++;
+ return 0;
+ }
+ } while (++i < CUCKOO_HASH_MAX_LOOP);
+
+ if (can_rehash) {
+ if ((rc = cuckoo_hash_rehash(hashtab)) < 0) {
+ /*
+ * Give up - this will drop whichever
+ * key/value pair we have currently displaced
+ * on the floor
+ */
+ return rc;
+ }
+ goto again;
+ }
+
+ EPRINTK("%s: failed hash add\n", __FUNCTION__);
+ /*
+ * Couldn't do it - bad as we've now removed some random thing
+ * from the table, and will just drop it on the floor. Better
+ * would be to somehow revert the table to the state it was in
+ * at the start
+ */
+ return -ENOSPC;
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_add);
+
+
+int cuckoo_hash_add_check(cuckoo_hash_table *hashtab,
+ cuckoo_hash_key *key, cuckoo_hash_value value,
+ int can_rehash)
+{
+ int stored_value;
+
+ if (cuckoo_hash_lookup(hashtab, key, &stored_value))
+ return -EBUSY;
+
+ return cuckoo_hash_add(hashtab, key, value, can_rehash);
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_add_check);
+
+
+int cuckoo_hash_remove(cuckoo_hash_table *hashtab, cuckoo_hash_key *key)
+{
+ cuckoo_hash hash;
+
+ hash = cuckoo_compute_hash(hashtab, key, &hashtab->a0);
+ if ((hashtab->table0[hash].state == CUCKOO_HASH_STATE_OCCUPIED) &&
+ cuckoo_hash_key_compare(hashtab, &(hashtab->table0[hash].key),
+ key)) {
+ hashtab->table0[hash].state = CUCKOO_HASH_STATE_VACANT;
+ hashtab->entries--;
+ return 0;
+ }
+
+ hash = cuckoo_compute_hash(hashtab, key, &hashtab->a1);
+ if ((hashtab->table1[hash].state == CUCKOO_HASH_STATE_OCCUPIED) &&
+ cuckoo_hash_key_compare(hashtab, &(hashtab->table1[hash].key),
+ key)) {
+ hashtab->table1[hash].state = CUCKOO_HASH_STATE_VACANT;
+ hashtab->entries--;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_remove);
+
+
+int cuckoo_hash_update(cuckoo_hash_table *hashtab, cuckoo_hash_key *key,
+ cuckoo_hash_value value)
+{
+ cuckoo_hash hash;
+
+ hash = cuckoo_compute_hash(hashtab, key, &hashtab->a0);
+ if ((hashtab->table0[hash].state == CUCKOO_HASH_STATE_OCCUPIED) &&
+ cuckoo_hash_key_compare(hashtab, &(hashtab->table0[hash].key),
+ key)) {
+ hashtab->table0[hash].value = value;
+ return 0;
+ }
+
+ hash = cuckoo_compute_hash(hashtab, key, &hashtab->a1);
+ if ((hashtab->table1[hash].state == CUCKOO_HASH_STATE_OCCUPIED) &&
+ cuckoo_hash_key_compare(hashtab, &(hashtab->table1[hash].key),
+ key)) {
+ hashtab->table1[hash].value = value;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_update);
+
+
+void cuckoo_hash_iterate_reset(cuckoo_hash_table *hashtab)
+{
+ hashtab->iterate_index = 0;
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_iterate_reset);
+
+
+int cuckoo_hash_iterate(cuckoo_hash_table *hashtab,
+ cuckoo_hash_key *key, cuckoo_hash_value *value)
+{
+ unsigned index;
+
+ while (hashtab->iterate_index < hashtab->length) {
+ index = hashtab->iterate_index;
+ ++hashtab->iterate_index;
+ if (hashtab->table0[index].state == CUCKOO_HASH_STATE_OCCUPIED) {
+ *key = hashtab->table0[index].key;
+ *value = hashtab->table0[index].value;
+ return 0;
+ }
+ }
+
+ while (hashtab->iterate_index >= hashtab->length &&
+ hashtab->iterate_index < hashtab->length * 2) {
+ index = hashtab->iterate_index - hashtab->length;
+ ++hashtab->iterate_index;
+ if (hashtab->table1[index].state == CUCKOO_HASH_STATE_OCCUPIED) {
+ *key = hashtab->table1[index].key;
+ *value = hashtab->table1[index].value;
+ return 0;
+ }
+ }
+
+ return -ENOSPC;
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_iterate);
+
+
+#if 0
+void cuckoo_hash_valid(cuckoo_hash_table *hashtab)
+{
+ int i, entry_count = 0;
+
+ for (i=0; i < hashtab->length; i++) {
+ EPRINTK_ON(hashtab->table0[i].state != CUCKOO_HASH_STATE_VACANT &&
+ hashtab->table0[i].state != CUCKOO_HASH_STATE_OCCUPIED);
+ if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+ entry_count++;
+ EPRINTK_ON(hashtab->table1[i].state != CUCKOO_HASH_STATE_VACANT &&
+ hashtab->table1[i].state != CUCKOO_HASH_STATE_OCCUPIED);
+ if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+ entry_count++;
+ }
+
+ if (entry_count != hashtab->entries) {
+ EPRINTK("%s: bad count\n", __FUNCTION__);
+ cuckoo_hash_dump(hashtab);
+ return;
+ }
+
+ for (i=0; i< hashtab->length; i++) {
+ if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+ if (i != cuckoo_compute_hash(hashtab,
+ &hashtab->table0[i].key,
+ &hashtab->a0)) {
+ EPRINTK("%s: Bad key table 0 index %d\n",
+ __FUNCTION__, i);
+ cuckoo_hash_dump(hashtab);
+ return;
+ }
+ if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+ if (i != cuckoo_compute_hash(hashtab,
+ &hashtab->table1[i].key,
+ &hashtab->a1)) {
+ EPRINTK("%s: Bad key table 1 index %d\n",
+ __FUNCTION__, i);
+ cuckoo_hash_dump(hashtab);
+ return;
+ }
+ }
+
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_valid);
+
+
+void cuckoo_hash_dump(cuckoo_hash_table *hashtab)
+{
+ int i, entry_count;
+
+ entry_count = 0;
+ for (i=0; i < hashtab->length; i++) {
+ EPRINTK_ON(hashtab->table0[i].state != CUCKOO_HASH_STATE_VACANT &&
+ hashtab->table0[i].state != CUCKOO_HASH_STATE_OCCUPIED);
+ if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+ entry_count++;
+ EPRINTK_ON(hashtab->table1[i].state != CUCKOO_HASH_STATE_VACANT &&
+ hashtab->table1[i].state != CUCKOO_HASH_STATE_OCCUPIED);
+ if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+ entry_count++;
+ }
+
+ EPRINTK("======================\n");
+ EPRINTK("Cuckoo hash table dump\n");
+ EPRINTK("======================\n");
+ EPRINTK("length: %d; length_bits: %d; key_length: %d\n", hashtab->length,
+ hashtab->length_bits, hashtab->key_length);
+ EPRINTK("Recorded entries: %d\n", hashtab->entries);
+ EPRINTK("Counted entries: %d\n", entry_count);
+ EPRINTK("a0: %llx; a1: %llx\n", hashtab->a0, hashtab->a1);
+ EPRINTK("-----------------------------------------\n");
+ EPRINTK("Index Occupied Key Value Index0 Index1\n");
+ EPRINTK("-----------------------------------------\n");
+ for (i=0; i< hashtab->length; i++) {
+ if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+ EPRINTK("%d %d %llx %d %d %d\n", i,
+ hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED,
+ hashtab->table0[i].key, hashtab->table0[i].value,
+ cuckoo_compute_hash(hashtab, &hashtab->table0[i].key,
+ &hashtab->a0),
+ cuckoo_compute_hash(hashtab, &hashtab->table0[i].key,
+ &hashtab->a1));
+ else
+ EPRINTK("%d %d - - - -\n", i,
+ hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED);
+
+ }
+ EPRINTK("-----------------------------------------\n");
+ EPRINTK("Index Occupied Key Value Index0 Index1\n");
+ EPRINTK("-----------------------------------------\n");
+ for (i=0; i< hashtab->length; i++) {
+ if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+ EPRINTK("%d %d %llx %d %d %d\n", i,
+ hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED,
+ hashtab->table1[i].key, hashtab->table1[i].value,
+ cuckoo_compute_hash(hashtab, &hashtab->table1[i].key,
+ &hashtab->a0),
+ cuckoo_compute_hash(hashtab, &hashtab->table1[i].key,
+ &hashtab->a1));
+ else
+ EPRINTK("%d %d - - - -\n", i,
+ hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED);
+ }
+ EPRINTK("======================\n");
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_dump);
+#endif
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * A cuckoo hash table consists of two sub tables. Each entry can
+ * hash to a position in each table. If, on entry, its position is
+ * found to be occupied, the existing element is moved to it's other
+ * location. This recurses until success or a loop is found. If a
+ * loop is found the table is rehashed.
+ *
+ * See http://www.it-c.dk/people/pagh/papers/cuckoo-jour.pdf
+ */
+
+#ifndef NET_ACCEL_CUCKOO_HASH_H
+#define NET_ACCEL_CUCKOO_HASH_H
+
+/*! Type used for hash table keys of ip pairs */
+typedef struct {
+ u32 local_ip;
+ //u32 remote_ip;
+ u16 local_port;
+ //u16 remote_port;
+ /* Technically only 1 bit, but use 16 to make key a round
+ number size */
+ u16 proto;
+} cuckoo_hash_ip_key;
+
+/*! Type used for hash table keys of mac addresses */
+typedef u64 cuckoo_hash_mac_key;
+
+/*! This type is designed to be large enough to hold all supported key
+ * sizes to avoid having to malloc storage for them.
+ */
+typedef u64 cuckoo_hash_key;
+
+/*! Type used for the values stored in the hash table */
+typedef int cuckoo_hash_value;
+
+/*! Type used for the hash used to index the table */
+typedef u32 cuckoo_hash;
+
+/*! How long to spend displacing values when adding before giving up
+ * and rehashing */
+#define CUCKOO_HASH_MAX_LOOP (hashtab->length)
+
+/*! State of hash table entry */
+typedef enum {
+ CUCKOO_HASH_STATE_VACANT = 0,
+ CUCKOO_HASH_STATE_OCCUPIED
+} cuckoo_hash_state;
+
+/*! An entry in the hash table */
+typedef struct {
+ cuckoo_hash_state state;
+ cuckoo_hash_key key;
+ cuckoo_hash_value value;
+} cuckoo_hash_entry;
+
+/*! A cuckoo hash table */
+typedef struct {
+ /*! The length of each table (NB. there are two tables of this
+ * length) */
+ unsigned length;
+ /*! The length of each table in bits */
+ unsigned length_bits;
+ /*! The length of the key in bytes */
+ unsigned key_length;
+ /*! The number of entries currently stored in the table */
+ unsigned entries;
+ /*! Index into table used by cuckoo_hash_iterate */
+ unsigned iterate_index;
+
+ /* parameter of hash functions */
+ /*! The "a" parameter of the first hash function */
+ cuckoo_hash_key a0;
+ /*! The "a" parameter of the second hash function */
+ cuckoo_hash_key a1;
+
+ /*! The first table */
+ cuckoo_hash_entry *table0;
+ /*! The second table */
+ cuckoo_hash_entry *table1;
+} cuckoo_hash_table;
+
+/*! Initialise the cuckoo has table
+ *
+ * \param hashtab A pointer to an unitialised hash table structure
+ * \param length_bits The number of elements in each table equals
+ * 2**length_bits
+ * \param key_length The length of the key in bytes
+ *
+ * \return 0 on success, -ENOMEM if it couldn't allocate the tables
+ */
+extern
+int cuckoo_hash_init(cuckoo_hash_table *hashtab, unsigned length_bits,
+ unsigned key_length);
+
+
+/*! Destroy a hash table
+ *
+ * \param hashtab A hash table that has previously been passed to a
+ * successful call of cuckoo_hash_init()
+ */
+extern
+void cuckoo_hash_destroy(cuckoo_hash_table *hashtab);
+
+
+/*! Lookup an entry in the hash table
+ *
+ * \param hashtab The hash table in which to look.
+ * \param key Pointer to a mac address to use as the key
+ * \param value On exit set to the value stored if key was present
+ *
+ * \return 0 if not present in the table, non-zero if it is (and value
+ * is set accordingly)
+ */
+extern
+int cuckoo_hash_lookup(cuckoo_hash_table *hashtab,
+ cuckoo_hash_key *key,
+ cuckoo_hash_value *value);
+
+/*! Add an entry to the hash table. Key must not be a duplicate of
+ * anything already in the table. If this is a risk, see
+ * cuckoo_hash_add_check
+ *
+ * \param hashtab The hash table to add the entry to
+ * \param key Pointer to a mac address to use as a key
+ * \param value The value to store
+ * \param can_rehash Flag to allow the add function to rehash the
+ * table if necessary
+ *
+ * \return 0 on success, non-zero on failure. -ENOSPC means it just
+ * couldn't find anywhere to put it - this is bad and probably means
+ * an entry has been dropped on the floor (but the entry you just
+ * tried to add may now be included)
+ */
+extern
+int cuckoo_hash_add(cuckoo_hash_table *hashtab,
+ cuckoo_hash_key *key,
+ cuckoo_hash_value value,
+ int can_rehash);
+
+/*! Same as cuckoo_hash_add but first checks to ensure entry is not
+ * already there
+ * \return -EBUSY if already there
+ */
+
+extern
+int cuckoo_hash_add_check(cuckoo_hash_table *hashtab,
+ cuckoo_hash_key *key,
+ cuckoo_hash_value value,
+ int can_rehash);
+/*! Remove an entry from the table
+ *
+ * \param hashtab The hash table to remove the entry from
+ * \param key The key that was used to previously add the entry
+ *
+ * \return 0 on success, -EINVAL if the entry couldn't be found
+ */
+extern
+int cuckoo_hash_remove(cuckoo_hash_table *hashtab, cuckoo_hash_key *key);
+
+
+/*! Helper for those using mac addresses to convert to a key for the
+ * hash table
+ */
+static inline cuckoo_hash_mac_key cuckoo_mac_to_key(const u8 *mac)
+{
+ return (cuckoo_hash_mac_key)(mac[0])
+ | (cuckoo_hash_mac_key)(mac[1]) << 8
+ | (cuckoo_hash_mac_key)(mac[2]) << 16
+ | (cuckoo_hash_mac_key)(mac[3]) << 24
+ | (cuckoo_hash_mac_key)(mac[4]) << 32
+ | (cuckoo_hash_mac_key)(mac[5]) << 40;
+}
+
+
+/*! Update an entry already in the hash table to take a new value
+ *
+ * \param hashtab The hash table to add the entry to
+ * \param key Pointer to a mac address to use as a key
+ * \param value The value to store
+ *
+ * \return 0 on success, non-zero on failure.
+ */
+int cuckoo_hash_update(cuckoo_hash_table *hashtab, cuckoo_hash_key *key,
+ cuckoo_hash_value value);
+
+
+/*! Go through the hash table and return all used entries (one per call)
+ *
+ * \param hashtab The hash table to iterate over
+ * \param key Pointer to a key to take the returned key
+ * \param value Pointer to a value to take the returned value
+ *
+ * \return 0 on success (key, value set), non-zero on failure.
+ */
+int cuckoo_hash_iterate(cuckoo_hash_table *hashtab,
+ cuckoo_hash_key *key, cuckoo_hash_value *value);
+void cuckoo_hash_iterate_reset(cuckoo_hash_table *hashtab);
+
+/* debug, not compiled by default */
+void cuckoo_hash_valid(cuckoo_hash_table *hashtab);
+void cuckoo_hash_dump(cuckoo_hash_table *hashtab);
+
+#endif /* NET_ACCEL_CUCKOO_HASH_H */
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <xen/evtchn.h>
+
+#include "accel_util.h"
+#include "accel_msg_iface.h"
+
+#define NET_ACCEL_MSG_Q_SIZE (1024)
+#define NET_ACCEL_MSG_Q_MASK (NET_ACCEL_MSG_Q_SIZE - 1)
+
+#ifdef NDEBUG
+#define NET_ACCEL_CHECK_MAGIC(_p, _errval)
+#define NET_ACCEL_SHOW_QUEUE(_t, _q, _id)
+#else
+#define NET_ACCEL_CHECK_MAGIC(_p, _errval) \
+ if (_p->magic != NET_ACCEL_MSG_MAGIC) { \
+ printk(KERN_ERR "%s: passed invalid shared page %p!\n", \
+ __FUNCTION__, _p); \
+ return _errval; \
+ }
+#define NET_ACCEL_SHOW_QUEUE(_t, _q, _id) \
+ printk(_t ": queue %d write %x read %x base %x limit %x\n", \
+ _id, _q->write, _q->read, _q->base, _q->limit);
+#endif
+
+/*
+ * We've been passed at least 2 pages. 1 control page and 1 or more
+ * data pages.
+ */
+int net_accel_msg_init_page(void *mem, int len, int up)
+{
+ struct net_accel_shared_page *shared_page =
+ (struct net_accel_shared_page*)mem;
+
+ if ((unsigned long)shared_page & NET_ACCEL_MSG_Q_MASK)
+ return -EINVAL;
+
+ shared_page->magic = NET_ACCEL_MSG_MAGIC;
+
+ shared_page->aflags = 0;
+
+ shared_page->net_dev_up = up;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_init_page);
+
+
+void net_accel_msg_init_queue(sh_msg_fifo2 *queue,
+ struct net_accel_msg_queue *indices,
+ struct net_accel_msg *base, int size)
+{
+ queue->fifo = base;
+ spin_lock_init(&queue->lock);
+ sh_fifo2_init(queue, size-1, &indices->read, &indices->write);
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_init_queue);
+
+
+static inline int _net_accel_msg_send(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *queue,
+ struct net_accel_msg *msg,
+ int is_reply)
+{
+ int rc = 0;
+ NET_ACCEL_CHECK_MAGIC(sp, -EINVAL);
+ rmb();
+ if (is_reply) {
+ EPRINTK_ON(sh_fifo2_is_full(queue));
+ sh_fifo2_put(queue, *msg);
+ } else {
+ if (sh_fifo2_not_half_full(queue)) {
+ sh_fifo2_put(queue, *msg);
+ } else {
+ rc = -ENOSPC;
+ }
+ }
+ wmb();
+ return rc;
+}
+
+/* Notify after a batch of messages have been sent */
+void net_accel_msg_notify(int irq)
+{
+ notify_remote_via_irq(irq);
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_notify);
+
+/*
+ * Send a message on the specified FIFO. Returns 0 on success, -errno
+ * on failure. The message in msg is copied to the current slot of the
+ * FIFO.
+ */
+int net_accel_msg_send(struct net_accel_shared_page *sp, sh_msg_fifo2 *q,
+ struct net_accel_msg *msg)
+{
+ unsigned long flags;
+ int rc;
+ net_accel_msg_lock_queue(q, &flags);
+ rc = _net_accel_msg_send(sp, q, msg, 0);
+ net_accel_msg_unlock_queue(q, &flags);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_send);
+
+
+/* As net_accel_msg_send but also posts a notification to the far end. */
+int net_accel_msg_send_notify(struct net_accel_shared_page *sp, int irq,
+ sh_msg_fifo2 *q, struct net_accel_msg *msg)
+{
+ unsigned long flags;
+ int rc;
+ net_accel_msg_lock_queue(q, &flags);
+ rc = _net_accel_msg_send(sp, q, msg, 0);
+ net_accel_msg_unlock_queue(q, &flags);
+ if (rc >= 0)
+ notify_remote_via_irq(irq);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_send_notify);
+
+
+int net_accel_msg_reply(struct net_accel_shared_page *sp, sh_msg_fifo2 *q,
+ struct net_accel_msg *msg)
+{
+ unsigned long flags;
+ int rc;
+ net_accel_msg_lock_queue(q, &flags);
+ rc = _net_accel_msg_send(sp, q, msg, 1);
+ net_accel_msg_unlock_queue(q, &flags);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_reply);
+
+
+/* As net_accel_msg_send but also posts a notification to the far end. */
+int net_accel_msg_reply_notify(struct net_accel_shared_page *sp, int irq,
+ sh_msg_fifo2 *q, struct net_accel_msg *msg)
+{
+ unsigned long flags;
+ int rc;
+ net_accel_msg_lock_queue(q, &flags);
+ rc = _net_accel_msg_send(sp, q, msg, 1);
+ net_accel_msg_unlock_queue(q, &flags);
+ if (rc >= 0)
+ notify_remote_via_irq(irq);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_reply_notify);
+
+
+/*
+ * Look at a received message, if any, so a decision can be made about
+ * whether to read it now or not. Cookie is a bit of debug which is
+ * set here and checked when passed to net_accel_msg_recv_next()
+ */
+int net_accel_msg_peek(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *queue,
+ struct net_accel_msg *msg, int *cookie)
+{
+ unsigned long flags;
+ int rc = 0;
+ NET_ACCEL_CHECK_MAGIC(sp, -EINVAL);
+ net_accel_msg_lock_queue(queue, &flags);
+ rmb();
+ if (sh_fifo2_is_empty(queue)) {
+ rc = -ENOENT;
+ } else {
+ *msg = sh_fifo2_peek(queue);
+ *cookie = *(queue->fifo_rd_i);
+ }
+ net_accel_msg_unlock_queue(queue, &flags);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_peek);
+
+
+/*
+ * Move the queue onto the next element, used after finished with a
+ * peeked msg
+ */
+int net_accel_msg_recv_next(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *queue, int cookie)
+{
+ unsigned long flags;
+ NET_ACCEL_CHECK_MAGIC(sp, -EINVAL);
+ net_accel_msg_lock_queue(queue, &flags);
+ rmb();
+ /* Mustn't be empty */
+ BUG_ON(sh_fifo2_is_empty(queue));
+ /*
+ * Check cookie matches, i.e. we're advancing over the same message
+ * as was got using peek
+ */
+ BUG_ON(cookie != *(queue->fifo_rd_i));
+ sh_fifo2_rd_next(queue);
+ wmb();
+ net_accel_msg_unlock_queue(queue, &flags);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_recv_next);
+
+
+/*
+ * Receive a message on the specified FIFO. Returns 0 on success,
+ * -errno on failure.
+ */
+int net_accel_msg_recv(struct net_accel_shared_page *sp, sh_msg_fifo2 *queue,
+ struct net_accel_msg *msg)
+{
+ unsigned long flags;
+ int rc = 0;
+ NET_ACCEL_CHECK_MAGIC(sp, -EINVAL);
+ net_accel_msg_lock_queue(queue, &flags);
+ rmb();
+ if (sh_fifo2_is_empty(queue)) {
+ rc = -ENOENT;
+ } else {
+ sh_fifo2_get(queue, msg);
+ }
+ wmb();
+ net_accel_msg_unlock_queue(queue, &flags);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_recv);
+
+
+/*
+ * Start sending a message without copying. returns a pointer to a message
+ * that will be filled out in place. The queue is locked until the message
+ * is sent.
+ */
+struct net_accel_msg *net_accel_msg_start_send(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *queue, unsigned long *flags)
+{
+ struct net_accel_msg *msg;
+ NET_ACCEL_CHECK_MAGIC(sp, NULL);
+ net_accel_msg_lock_queue(queue, flags);
+ rmb();
+ if (sh_fifo2_not_half_full(queue)) {
+ msg = sh_fifo2_pokep(queue);
+ } else {
+ net_accel_msg_unlock_queue(queue, flags);
+ msg = NULL;
+ }
+ return msg;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_start_send);
+
+
+static inline void _msg_complete(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *queue,
+ unsigned long *flags)
+{
+ sh_fifo2_wr_next(queue);
+ net_accel_msg_unlock_queue(queue, flags);
+}
+
+/*
+ * Complete the sending of a message started with net_accel_msg_start_send. The
+ * message is implicit since the queue was locked by _start
+ */
+void net_accel_msg_complete_send(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *queue,
+ unsigned long *flags)
+{
+ _msg_complete(sp, queue, flags);
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_complete_send);
+
+/* As net_accel_msg_complete_send but does the notify. */
+void net_accel_msg_complete_send_notify(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *queue,
+ unsigned long *flags, int irq)
+{
+ _msg_complete(sp, queue, flags);
+ notify_remote_via_irq(irq);
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_complete_send_notify);
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef NET_ACCEL_MSG_IFACE_H
+#define NET_ACCEL_MSG_IFACE_H
+
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+
+#include "accel_shared_fifo.h"
+
+#define NET_ACCEL_MSG_MAGIC (0x85465479)
+
+/*! We talk version 0.010 of the interdomain protocol */
+#define NET_ACCEL_MSG_VERSION (0x00001000)
+
+/*! Shared memory portion of inter-domain FIFO */
+struct net_accel_msg_queue {
+ u32 read;
+ u32 write;
+};
+
+
+/*
+ * The aflags in the following structure is used as follows:
+ *
+ * - each bit is set when one of the corresponding variables is
+ * changed by either end.
+ *
+ * - the end that has made the change then forwards an IRQ to the
+ * other
+ *
+ * - the IRQ handler deals with these bits either on the fast path, or
+ * for less common changes, by jumping onto the slow path.
+ *
+ * - once it has seen a change, it clears the relevant bit.
+ *
+ * aflags is accessed atomically using clear_bit, test_bit,
+ * test_and_set_bit etc
+ */
+
+/*
+ * The following used to signify to the other domain when the queue
+ * they want to use is full, and when it is no longer full. Could be
+ * compressed to use fewer bits but done this way for simplicity and
+ * clarity
+ */
+
+/* "dom0->domU queue" is full */
+#define NET_ACCEL_MSG_AFLAGS_QUEUE0FULL 0x1
+#define NET_ACCEL_MSG_AFLAGS_QUEUE0FULL_B 0
+/* "dom0->domU queue" is not full */
+#define NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL 0x2
+#define NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL_B 1
+/* "domU->dom0 queue" is full */
+#define NET_ACCEL_MSG_AFLAGS_QUEUEUFULL 0x4
+#define NET_ACCEL_MSG_AFLAGS_QUEUEUFULL_B 2
+/* "domU->dom0 queue" is not full */
+#define NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL 0x8
+#define NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL_B 3
+/* dom0 -> domU net_dev up/down events */
+#define NET_ACCEL_MSG_AFLAGS_NETUPDOWN 0x10
+#define NET_ACCEL_MSG_AFLAGS_NETUPDOWN_B 4
+
+/*
+ * Masks used to test if there are any messages for domU and dom0
+ * respectively
+ */
+#define NET_ACCEL_MSG_AFLAGS_TO_DOMU_MASK \
+ (NET_ACCEL_MSG_AFLAGS_QUEUE0FULL | \
+ NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL | \
+ NET_ACCEL_MSG_AFLAGS_NETUPDOWN)
+#define NET_ACCEL_MSG_AFLAGS_TO_DOM0_MASK \
+ (NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL | \
+ NET_ACCEL_MSG_AFLAGS_QUEUEUFULL)
+
+/*! The shared data structure used for inter-VM communication. */
+struct net_accel_shared_page {
+ /*! Sanity check */
+ u32 magic;
+ /*! Used by host/Dom0 */
+ struct net_accel_msg_queue queue0;
+ /*! Used by guest/DomU */
+ struct net_accel_msg_queue queue1;
+ /*! Atomic flags, used to communicate simple state changes */
+ u32 aflags;
+ /*! State of net_dev used for acceleration */
+ u32 net_dev_up;
+};
+
+
+enum net_accel_hw_type {
+ /*! Not a virtualisable NIC: use slow path. */
+ NET_ACCEL_MSG_HWTYPE_NONE = 0,
+ /*! NIC is Falcon-based */
+ NET_ACCEL_MSG_HWTYPE_FALCON_A = 1,
+ NET_ACCEL_MSG_HWTYPE_FALCON_B = 2,
+};
+
+/*! The maximum number of pages used by an event queue. */
+#define EF_HW_FALCON_EVQ_PAGES 8
+
+struct net_accel_hw_falcon_b {
+ /* VI */
+ /*! Grant for Tx DMA Q */
+ u32 txdmaq_gnt;
+ /*! Grant for Rx DMA Q */
+ u32 rxdmaq_gnt;
+ /*! Machine frame number for Tx/Rx doorbell page */
+ u32 doorbell_mfn;
+ /*! Grant for Tx/Rx doorbell page */
+ u32 doorbell_gnt;
+
+ /* Event Q */
+ /*! Grants for the pages of the EVQ */
+ u32 evq_mem_gnts[EF_HW_FALCON_EVQ_PAGES];
+ u32 evq_offs;
+ /*! log2(pages in event Q) */
+ u32 evq_order;
+ /*! Capacity in events */
+ u32 evq_capacity;
+ /*! Eventq pointer register physical address */
+ u32 evq_rptr;
+ /*! Interface instance */
+ u32 instance;
+ /*! Capacity of RX queue */
+ u32 rx_capacity;
+ /*! Capacity of TX queue */
+ u32 tx_capacity;
+
+ /* NIC */
+ s32 nic_arch;
+ s32 nic_revision;
+ u8 nic_variant;
+};
+
+struct net_accel_hw_falcon_a {
+ struct net_accel_hw_falcon_b common;
+ u32 evq_rptr_gnt;
+};
+
+
+/*! Description of the hardware that the DomU is being given. */
+struct net_accel_msg_hw {
+ u32 type; /*!< Hardware type */
+ union {
+ struct net_accel_hw_falcon_a falcon_a;
+ struct net_accel_hw_falcon_b falcon_b;
+ } resources;
+};
+
+/*! Start-of-day handshake message. Dom0 fills in its version and
+ * sends, DomU checks, inserts its version and replies
+ */
+struct net_accel_msg_hello {
+ /*! Sender's version (set by each side in turn) */
+ u32 version;
+ /*! max pages allocated/allowed for buffers */
+ u32 max_pages;
+};
+
+/*! Maximum number of page requests that can fit in a message. */
+#define NET_ACCEL_MSG_MAX_PAGE_REQ (8)
+
+/*! Request for NIC buffers. DomU fils out pages and grants (and
+ * optionally) reqid, dom0 fills out buf and sends reply
+ */
+struct net_accel_msg_map_buffers {
+ u32 reqid; /*!< Optional request ID */
+ u32 pages; /*!< Number of pages to map */
+ u32 grants[NET_ACCEL_MSG_MAX_PAGE_REQ]; /*!< Grant ids to map */
+ u32 buf; /*!< NIC buffer address of pages obtained */
+};
+
+/*! Notification of a change to local mac address, used to filter
+ locally destined packets off the fast path */
+struct net_accel_msg_localmac {
+ u32 flags; /*!< Should this be added or removed? */
+ u8 mac[ETH_ALEN]; /*!< The mac address to filter onto slow path */
+};
+
+struct net_accel_msg_fastpath {
+ u32 flags; /*!< Should this be added or removed? */
+ u8 mac[ETH_ALEN];/*!< The mac address to filter onto fast path */
+ u16 port; /*!< The port of the connection */
+ u32 ip; /*!< The IP address of the connection */
+ u8 proto; /*!< The protocol of connection (TCP/UDP) */
+};
+
+/*! Values for struct ef_msg_localmac/fastpath.flags */
+#define NET_ACCEL_MSG_ADD 0x1
+#define NET_ACCEL_MSG_REMOVE 0x2
+
+/*! Overall message structure */
+struct net_accel_msg {
+ /*! ID specifying type of messge */
+ u32 id;
+ union {
+ /*! handshake */
+ struct net_accel_msg_hello hello;
+ /*! hardware description */
+ struct net_accel_msg_hw hw;
+ /*! buffer map request */
+ struct net_accel_msg_map_buffers mapbufs;
+ /*! mac address of a local interface */
+ struct net_accel_msg_localmac localmac;
+ /*! address of a new fastpath connection */
+ struct net_accel_msg_fastpath fastpath;
+ /*! make the message a fixed size */
+ u8 pad[128 - sizeof(u32)];
+ } u;
+};
+
+
+#define NET_ACCEL_MSG_HW_TO_MSG(_u) container_of(_u, struct net_accel_msg, u.hw)
+
+/*! Inter-domain message FIFO */
+typedef struct {
+ struct net_accel_msg *fifo;
+ u32 fifo_mask;
+ u32 *fifo_rd_i;
+ u32 *fifo_wr_i;
+ spinlock_t lock;
+ u32 is_locked; /* Debug flag */
+} sh_msg_fifo2;
+
+
+#define NET_ACCEL_MSG_OFFSET_MASK PAGE_MASK
+
+/* Modifiers */
+#define NET_ACCEL_MSG_REPLY (0x80000000)
+#define NET_ACCEL_MSG_ERROR (0x40000000)
+
+/* Dom0 -> DomU and reply. Handshake/version check. */
+#define NET_ACCEL_MSG_HELLO (0x00000001)
+/* Dom0 -> DomU : hardware setup (VI info.) */
+#define NET_ACCEL_MSG_SETHW (0x00000002)
+/*
+ * Dom0 -> DomU. Notification of a local mac to add/remove from slow
+ * path filter
+ */
+#define NET_ACCEL_MSG_LOCALMAC (0x00000003)
+/*
+ * DomU -> Dom0 and reply. Request for buffer table entries for
+ * preallocated pages.
+ */
+#define NET_ACCEL_MSG_MAPBUF (0x00000004)
+/*
+ * Dom0 -> DomU. Notification of a local mac to add/remove from fast
+ * path filter
+ */
+#define NET_ACCEL_MSG_FASTPATH (0x00000005)
+
+/*! Initialise a message and set the type
+ * \param message : the message
+ * \param code : the message type
+ */
+static inline void net_accel_msg_init(struct net_accel_msg *msg, int code) {
+ msg->id = (u32)code;
+}
+
+/*! initialise a shared page structure
+ * \param shared_page : mapped memory in which the structure resides
+ * \param len : size of the message FIFO area that follows
+ * \param up : initial up/down state of netdev
+ * \return 0 or an error code
+ */
+extern int net_accel_msg_init_page(void *shared_page, int len, int up);
+
+/*! initialise a message queue
+ * \param queue : the message FIFO to initialise
+ * \param indices : the read and write indices in shared memory
+ * \param base : the start of the memory area for the FIFO
+ * \param size : the size of the FIFO in bytes
+ */
+extern void net_accel_msg_init_queue(sh_msg_fifo2 *queue,
+ struct net_accel_msg_queue *indices,
+ struct net_accel_msg *base, int size);
+
+/* Notify after a batch of messages have been sent */
+extern void net_accel_msg_notify(int irq);
+
+/*! Send a message on the specified FIFO. The message is copied to the
+ * current slot of the FIFO.
+ * \param sp : pointer to shared page
+ * \param q : pointer to message FIFO to use
+ * \param msg : pointer to message
+ * \return 0 on success, -errno on
+ */
+extern int net_accel_msg_send(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *q,
+ struct net_accel_msg *msg);
+extern int net_accel_msg_reply(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *q,
+ struct net_accel_msg *msg);
+
+/*! As net_accel_msg_send but also posts a notification to the far end. */
+extern int net_accel_msg_send_notify(struct net_accel_shared_page *sp,
+ int irq, sh_msg_fifo2 *q,
+ struct net_accel_msg *msg);
+/*! As net_accel_msg_send but also posts a notification to the far end. */
+extern int net_accel_msg_reply_notify(struct net_accel_shared_page *sp,
+ int irq, sh_msg_fifo2 *q,
+ struct net_accel_msg *msg);
+
+/*! Receive a message on the specified FIFO. Returns 0 on success,
+ * -errno on failure.
+ */
+extern int net_accel_msg_recv(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *q,
+ struct net_accel_msg *msg);
+
+/*! Look at a received message, if any, so a decision can be made
+ * about whether to read it now or not. Cookie is a bit of debug
+ * which is set here and checked when passed to
+ * net_accel_msg_recv_next()
+ */
+extern int net_accel_msg_peek(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *queue,
+ struct net_accel_msg *msg, int *cookie);
+/*! Move the queue onto the next element, used after finished with a
+ * peeked msg
+ */
+extern int net_accel_msg_recv_next(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *queue, int cookie);
+
+/*! Start sending a message without copying. returns a pointer to a
+ * message that will be filled out in place. The queue is locked
+ * until the message is sent.
+ */
+extern
+struct net_accel_msg *net_accel_msg_start_send(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *queue,
+ unsigned long *flags);
+
+
+/*! Complete the sending of a message started with
+ * net_accel_msg_start_send. The message is implicit since the queue
+ * was locked by _start
+ */
+extern void net_accel_msg_complete_send(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *queue,
+ unsigned long *flags);
+
+/*! As net_accel_msg_complete_send but does the notify. */
+extern void net_accel_msg_complete_send_notify(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *queue,
+ unsigned long *flags, int irq);
+
+/*! Lock the queue so that multiple "_locked" functions can be called
+ * without the queue being modified by others
+ */
+static inline
+void net_accel_msg_lock_queue(sh_msg_fifo2 *queue, unsigned long *flags)
+{
+ spin_lock_irqsave(&queue->lock, (*flags));
+ rmb();
+ BUG_ON(queue->is_locked);
+ queue->is_locked = 1;
+}
+
+/*! Unlock the queue */
+static inline
+void net_accel_msg_unlock_queue(sh_msg_fifo2 *queue, unsigned long *flags)
+{
+ BUG_ON(!queue->is_locked);
+ queue->is_locked = 0;
+ wmb();
+ spin_unlock_irqrestore(&queue->lock, (*flags));
+}
+
+/*! Give up without sending a message that was started with
+ * net_accel_msg_start_send()
+ */
+static inline
+void net_accel_msg_abort_send(struct net_accel_shared_page *sp,
+ sh_msg_fifo2 *queue, unsigned long *flags)
+{
+ net_accel_msg_unlock_queue(queue, flags);
+}
+
+/*! Test the queue to ensure there is sufficient space */
+static inline
+int net_accel_msg_check_space(sh_msg_fifo2 *queue, unsigned space)
+{
+ return sh_fifo2_space(queue) >= space;
+}
+
+#endif /* NET_ACCEL_MSG_IFACE_H */
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef NET_ACCEL_SHARED_FIFO_H
+#define NET_ACCEL_SHARED_FIFO_H
+
+/*
+ * This is based on fifo.h, but handles sharing between address spaces
+ * that don't trust each other, by splitting out the read and write
+ * indices. This costs at least one pointer indirection more than the
+ * vanilla version per access.
+ */
+
+typedef struct {
+ char* fifo;
+ unsigned fifo_mask;
+ unsigned *fifo_rd_i;
+ unsigned *fifo_wr_i;
+} sh_byte_fifo2;
+
+#define SH_FIFO2_M(f, x) ((x) & ((f)->fifo_mask))
+
+static inline unsigned log2_ge(unsigned long n, unsigned min_order) {
+ unsigned order = min_order;
+ while((1ul << order) < n) ++order;
+ return order;
+}
+
+static inline unsigned long pow2(unsigned order) {
+ return (1ul << order);
+}
+
+#define is_pow2(x) (pow2(log2_ge((x), 0)) == (x))
+
+#define sh_fifo2_valid(f) ((f) && (f)->fifo && (f)->fifo_mask > 0 && \
+ is_pow2((f)->fifo_mask+1u))
+
+#define sh_fifo2_init(f, cap, _rptr, _wptr) \
+ do { \
+ BUG_ON(!is_pow2((cap) + 1)); \
+ (f)->fifo_rd_i = _rptr; \
+ (f)->fifo_wr_i = _wptr; \
+ *(f)->fifo_rd_i = *(f)->fifo_wr_i = 0u; \
+ (f)->fifo_mask = (cap); \
+ } while(0)
+
+#define sh_fifo2_num(f) SH_FIFO2_M((f),*(f)->fifo_wr_i - *(f)->fifo_rd_i)
+#define sh_fifo2_space(f) SH_FIFO2_M((f),*(f)->fifo_rd_i - *(f)->fifo_wr_i-1u)
+#define sh_fifo2_is_empty(f) (sh_fifo2_num(f)==0)
+#define sh_fifo2_not_empty(f) (sh_fifo2_num(f)!=0)
+#define sh_fifo2_is_full(f) (sh_fifo2_space(f)==0u)
+#define sh_fifo2_not_full(f) (sh_fifo2_space(f)!=0u)
+#define sh_fifo2_buf_size(f) ((f)->fifo_mask + 1u)
+#define sh_fifo2_capacity(f) ((f)->fifo_mask)
+#define sh_fifo2_end(f) ((f)->fifo + sh_fifo2_buf_size(f))
+#define sh_fifo2_not_half_full(f) (sh_fifo2_space(f) > (sh_fifo2_capacity(f) >> 1))
+
+#define sh_fifo2_peek(f) ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_rd_i)])
+#define sh_fifo2_peekp(f) ((f)->fifo + SH_FIFO2_M((f), *(f)->fifo_rd_i))
+#define sh_fifo2_poke(f) ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_wr_i)])
+#define sh_fifo2_pokep(f) ((f)->fifo + SH_FIFO2_M((f), *(f)->fifo_wr_i))
+#define sh_fifo2_peek_i(f,i) ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_rd_i+(i))])
+#define sh_fifo2_poke_i(f,i) ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_wr_i+(i))])
+
+#define sh_fifo2_rd_next(f) \
+ do {*(f)->fifo_rd_i = *(f)->fifo_rd_i + 1u;} while(0)
+#define sh_fifo2_wr_next(f) \
+ do {*(f)->fifo_wr_i = *(f)->fifo_wr_i + 1u;} while(0)
+#define sh_fifo2_rd_adv(f, n) \
+ do {*(f)->fifo_rd_i = *(f)->fifo_rd_i + (n);} while(0)
+#define sh_fifo2_wr_adv(f, n) \
+ do {*(f)->fifo_wr_i = *(f)->fifo_wr_i + (n);} while(0)
+
+#define sh_fifo2_put(f, v) \
+ do {sh_fifo2_poke(f) = (v); wmb(); sh_fifo2_wr_next(f);} while(0)
+
+#define sh_fifo2_get(f, pv) \
+ do {*(pv) = sh_fifo2_peek(f); mb(); sh_fifo2_rd_next(f);} while(0)
+
+static inline unsigned sh_fifo2_contig_num(sh_byte_fifo2 *f)
+{
+ unsigned fifo_wr_i = SH_FIFO2_M(f, *f->fifo_wr_i);
+ unsigned fifo_rd_i = SH_FIFO2_M(f, *f->fifo_rd_i);
+
+ return (fifo_wr_i >= fifo_rd_i)
+ ? fifo_wr_i - fifo_rd_i
+ : f->fifo_mask + 1u - *(f)->fifo_rd_i;
+}
+
+static inline unsigned sh_fifo2_contig_space(sh_byte_fifo2 *f)
+{
+ unsigned fifo_wr_i = SH_FIFO2_M(f, *f->fifo_wr_i);
+ unsigned fifo_rd_i = SH_FIFO2_M(f, *f->fifo_rd_i);
+
+ return (fifo_rd_i > fifo_wr_i)
+ ? fifo_rd_i - fifo_wr_i - 1
+ : (f->fifo_mask + 1u - fifo_wr_i
+ /*
+ * The last byte can't be used if the read pointer
+ * is at zero.
+ */
+ - (fifo_rd_i==0));
+}
+
+
+#endif /* NET_ACCEL_SHARED_FIFO_H */
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <linux/if_ether.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/hypercall.h>
+#include <xen/xenbus.h>
+#include <xen/driver_util.h>
+#include <xen/gnttab.h>
+
+#include "accel_util.h"
+
+#ifdef EFX_GCOV
+#include "gcov.h"
+
+static int __init net_accel_init(void)
+{
+ gcov_provider_init(THIS_MODULE);
+ return 0;
+}
+module_init(net_accel_init);
+
+static void __exit net_accel_exit(void)
+{
+ gcov_provider_fini(THIS_MODULE);
+}
+module_exit(net_accel_exit);
+#endif
+
+/* Shutdown remote domain that is misbehaving */
+int net_accel_shutdown_remote(int domain)
+{
+ struct sched_remote_shutdown sched_shutdown = {
+ .domain_id = domain,
+ .reason = SHUTDOWN_crash
+ };
+
+ EPRINTK("Crashing domain %d\n", domain);
+
+ return HYPERVISOR_sched_op(SCHEDOP_remote_shutdown, &sched_shutdown);
+}
+EXPORT_SYMBOL(net_accel_shutdown_remote);
+
+
+/* Based on xenbus_backend_client.c:xenbus_map_ring() */
+static int net_accel_map_grant(struct xenbus_device *dev, int gnt_ref,
+ grant_handle_t *handle, void *vaddr,
+ u64 *dev_bus_addr, unsigned flags)
+{
+ struct gnttab_map_grant_ref op;
+
+ gnttab_set_map_op(&op, (unsigned long)vaddr, flags,
+ gnt_ref, dev->otherend_id);
+
+ BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
+
+ if (op.status != GNTST_okay) {
+ xenbus_dev_error
+ (dev, op.status,
+ "failed mapping in shared page %d from domain %d\n",
+ gnt_ref, dev->otherend_id);
+ } else {
+ *handle = op.handle;
+ if (dev_bus_addr)
+ *dev_bus_addr = op.dev_bus_addr;
+ }
+
+ return op.status;
+}
+
+
+/* Based on xenbus_backend_client.c:xenbus_unmap_ring() */
+static int net_accel_unmap_grant(struct xenbus_device *dev,
+ grant_handle_t handle,
+ void *vaddr, u64 dev_bus_addr,
+ unsigned flags)
+{
+ struct gnttab_unmap_grant_ref op;
+
+ gnttab_set_unmap_op(&op, (unsigned long)vaddr, flags, handle);
+
+ if (dev_bus_addr)
+ op.dev_bus_addr = dev_bus_addr;
+
+ BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
+
+ if (op.status != GNTST_okay)
+ xenbus_dev_error(dev, op.status,
+ "failed unmapping page at handle %d error %d\n",
+ handle, op.status);
+
+ return op.status;
+}
+
+
+int net_accel_map_device_page(struct xenbus_device *dev,
+ int gnt_ref, grant_handle_t *handle,
+ u64 *dev_bus_addr)
+{
+ return net_accel_map_grant(dev, gnt_ref, handle, 0, dev_bus_addr,
+ GNTMAP_device_map);
+}
+EXPORT_SYMBOL_GPL(net_accel_map_device_page);
+
+
+int net_accel_unmap_device_page(struct xenbus_device *dev,
+ grant_handle_t handle, u64 dev_bus_addr)
+{
+ return net_accel_unmap_grant(dev, handle, 0, dev_bus_addr,
+ GNTMAP_device_map);
+}
+EXPORT_SYMBOL_GPL(net_accel_unmap_device_page);
+
+
+struct net_accel_valloc_grant_mapping {
+ struct vm_struct *vm;
+ int pages;
+ grant_handle_t grant_handles[0];
+};
+
+/* Map a series of grants into a contiguous virtual area */
+static void *net_accel_map_grants_valloc(struct xenbus_device *dev,
+ unsigned *grants, int npages,
+ unsigned flags, void **priv)
+{
+ struct net_accel_valloc_grant_mapping *map;
+ struct vm_struct *vm;
+ void *addr;
+ int i, j, rc;
+
+ vm = alloc_vm_area(PAGE_SIZE * npages);
+ if (vm == NULL) {
+ EPRINTK("No memory from alloc_vm_area.\n");
+ return NULL;
+ }
+ /*
+ * Get a structure in which we will record all the info needed
+ * to undo the mapping.
+ */
+ map = kzalloc(sizeof(struct net_accel_valloc_grant_mapping) +
+ npages * sizeof(grant_handle_t), GFP_KERNEL);
+ if (map == NULL) {
+ EPRINTK("No memory for net_accel_valloc_grant_mapping\n");
+ free_vm_area(vm);
+ return NULL;
+ }
+ map->vm = vm;
+ map->pages = npages;
+
+ /* Do the actual mapping */
+ addr = vm->addr;
+ for (i = 0; i < npages; i++) {
+ rc = net_accel_map_grant(dev, grants[i], map->grant_handles + i,
+ addr, NULL, flags);
+ if (rc != 0)
+ goto undo;
+ addr = (void*)((unsigned long)addr + PAGE_SIZE);
+ }
+
+ if (priv)
+ *priv = (void *)map;
+ else
+ kfree(map);
+
+ return vm->addr;
+
+ undo:
+ EPRINTK("Aborting contig map due to single map failure %d (%d of %d)\n",
+ rc, i+1, npages);
+ for (j = 0; j < i; j++) {
+ addr = (void*)((unsigned long)vm->addr + (j * PAGE_SIZE));
+ net_accel_unmap_grant(dev, map->grant_handles[j], addr, 0,
+ flags);
+ }
+ free_vm_area(vm);
+ kfree(map);
+ return NULL;
+}
+
+/* Undo the result of the mapping */
+static void net_accel_unmap_grants_vfree(struct xenbus_device *dev,
+ unsigned flags, void *priv)
+{
+ struct net_accel_valloc_grant_mapping *map =
+ (struct net_accel_valloc_grant_mapping *)priv;
+
+ void *addr = map->vm->addr;
+ int npages = map->pages;
+ int i;
+
+ for (i = 0; i < npages; i++) {
+ net_accel_unmap_grant(dev, map->grant_handles[i], addr, 0,
+ flags);
+ addr = (void*)((unsigned long)addr + PAGE_SIZE);
+ }
+ free_vm_area(map->vm);
+ kfree(map);
+}
+
+
+void *net_accel_map_grants_contig(struct xenbus_device *dev,
+ unsigned *grants, int npages,
+ void **priv)
+{
+ return net_accel_map_grants_valloc(dev, grants, npages,
+ GNTMAP_host_map, priv);
+}
+EXPORT_SYMBOL(net_accel_map_grants_contig);
+
+
+void net_accel_unmap_grants_contig(struct xenbus_device *dev,
+ void *priv)
+{
+ net_accel_unmap_grants_vfree(dev, GNTMAP_host_map, priv);
+}
+EXPORT_SYMBOL(net_accel_unmap_grants_contig);
+
+
+void *net_accel_map_iomem_page(struct xenbus_device *dev, int gnt_ref,
+ void **priv)
+{
+ return net_accel_map_grants_valloc(dev, &gnt_ref, 1,
+ GNTMAP_host_map, priv);
+}
+EXPORT_SYMBOL(net_accel_map_iomem_page);
+
+
+void net_accel_unmap_iomem_page(struct xenbus_device *dev, void *priv)
+{
+ net_accel_unmap_grants_vfree(dev, GNTMAP_host_map, priv);
+}
+EXPORT_SYMBOL(net_accel_unmap_iomem_page);
+
+
+int net_accel_grant_page(struct xenbus_device *dev, unsigned long mfn,
+ int is_iomem)
+{
+ int err = gnttab_grant_foreign_access(dev->otherend_id, mfn,
+ is_iomem ? GTF_PCD : 0);
+ if (err < 0)
+ xenbus_dev_error(dev, err, "failed granting access to page\n");
+ return err;
+}
+EXPORT_SYMBOL_GPL(net_accel_grant_page);
+
+
+int net_accel_ungrant_page(grant_ref_t gntref)
+{
+ if (unlikely(gnttab_query_foreign_access(gntref) != 0)) {
+ EPRINTK("%s: remote domain still using grant %d\n", __FUNCTION__,
+ gntref);
+ return -EBUSY;
+ }
+
+ gnttab_end_foreign_access(gntref, 0);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(net_accel_ungrant_page);
+
+
+int net_accel_xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
+{
+ char *s, *e, *macstr;
+ int i;
+
+ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
+ if (IS_ERR(macstr))
+ return PTR_ERR(macstr);
+
+ for (i = 0; i < ETH_ALEN; i++) {
+ mac[i] = simple_strtoul(s, &e, 16);
+ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
+ kfree(macstr);
+ return -ENOENT;
+ }
+ s = e+1;
+ }
+
+ kfree(macstr);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(net_accel_xen_net_read_mac);
+
+
+void net_accel_update_state(struct xenbus_device *dev, int state)
+{
+ struct xenbus_transaction tr;
+ int err;
+
+ DPRINTK("%s: setting accelstate to %s\n", __FUNCTION__,
+ xenbus_strstate(state));
+
+ if (xenbus_exists(XBT_NIL, dev->nodename, "")) {
+ VPRINTK("%s: nodename %s\n", __FUNCTION__, dev->nodename);
+ again:
+ err = xenbus_transaction_start(&tr);
+ if (err == 0)
+ err = xenbus_printf(tr, dev->nodename, "accelstate",
+ "%d", state);
+ if (err != 0) {
+ xenbus_transaction_end(tr, 1);
+ } else {
+ err = xenbus_transaction_end(tr, 0);
+ if (err == -EAGAIN)
+ goto again;
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(net_accel_update_state);
+
+MODULE_LICENSE("GPL");
--- /dev/null
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ * 9501 Jeronimo Road, Suite 250,
+ * Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#ifndef NETBACK_ACCEL_UTIL_H
+#define NETBACK_ACCEL_UTIL_H
+
+#ifdef DPRINTK
+#undef DPRINTK
+#endif
+
+#define FILE_LEAF strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__
+
+#if 1
+#define VPRINTK(_f, _a...)
+#else
+#define VPRINTK(_f, _a...) \
+ printk("(file=%s, line=%d) " _f, \
+ FILE_LEAF , __LINE__ , ## _a )
+#endif
+
+#if 1
+#define DPRINTK(_f, _a...)
+#else
+#define DPRINTK(_f, _a...) \
+ printk("(file=%s, line=%d) " _f, \
+ FILE_LEAF , __LINE__ , ## _a )
+#endif
+
+#define EPRINTK(_f, _a...) \
+ printk("(file=%s, line=%d) " _f, \
+ FILE_LEAF , __LINE__ , ## _a )
+
+#define EPRINTK_ON(exp) \
+ do { \
+ if (exp) \
+ EPRINTK("%s at %s:%d\n", #exp, __FILE__, __LINE__); \
+ } while(0)
+
+#define DPRINTK_ON(exp) \
+ do { \
+ if (exp) \
+ DPRINTK("%s at %s:%d\n", #exp, __FILE__, __LINE__); \
+ } while(0)
+
+#include <xen/xenbus.h>
+
+/*! Map a set of pages from another domain
+ * \param dev The xenbus device context
+ * \param priv The private data returned by the mapping function
+ */
+extern
+void *net_accel_map_grants_contig(struct xenbus_device *dev,
+ unsigned *grants, int npages,
+ void **priv);
+
+/*! Unmap a set of pages mapped using net_accel_map_grants_contig.
+ * \param dev The xenbus device context
+ * \param priv The private data returned by the mapping function
+ */
+extern
+void net_accel_unmap_grants_contig(struct xenbus_device *dev, void *priv);
+
+/*! Read the MAC address of a device from xenstore */
+extern
+int net_accel_xen_net_read_mac(struct xenbus_device *dev, u8 mac[]);
+
+/*! Update the accelstate field for a device in xenstore */
+extern
+void net_accel_update_state(struct xenbus_device *dev, int state);
+
+/* These four map/unmap functions are based on
+ * xenbus_backend_client.c:xenbus_map_ring(). However, they are not
+ * used for ring buffers, instead just to map pages between domains,
+ * or to map a page so that it is accessible by a device
+ */
+extern
+int net_accel_map_device_page(struct xenbus_device *dev,
+ int gnt_ref, grant_handle_t *handle,
+ u64 *dev_bus_addr);
+extern
+int net_accel_unmap_device_page(struct xenbus_device *dev,
+ grant_handle_t handle, u64 dev_bus_addr);
+extern
+void *net_accel_map_iomem_page(struct xenbus_device *dev, int gnt_ref,
+ void **priv);
+extern
+void net_accel_unmap_iomem_page(struct xenbus_device *dev, void *priv);
+
+/*! Grrant a page to remote domain */
+extern
+int net_accel_grant_page(struct xenbus_device *dev, unsigned long mfn,
+ int is_iomem);
+/*! Undo a net_accel_grant_page */
+extern
+int net_accel_ungrant_page(grant_ref_t gntref);
+
+
+/*! Shutdown remote domain that is misbehaving */
+extern
+int net_accel_shutdown_remote(int domain);
+
+
+#endif
--- /dev/null
+
+obj-$(CONFIG_XEN_TPMDEV_BACKEND) += tpmbk.o
+
+tpmbk-y += tpmback.o interface.o xenbus.o
--- /dev/null
+/******************************************************************************
+ * drivers/xen/tpmback/common.h
+ */
+
+#ifndef __TPM__BACKEND__COMMON_H__
+#define __TPM__BACKEND__COMMON_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <xen/evtchn.h>
+#include <xen/driver_util.h>
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/tpmif.h>
+#include <asm/io.h>
+#include <asm/pgalloc.h>
+
+#define DPRINTK(_f, _a...) \
+ pr_debug("(file=%s, line=%d) " _f, \
+ __FILE__ , __LINE__ , ## _a )
+
+struct backend_info;
+
+typedef struct tpmif_st {
+ struct list_head tpmif_list;
+ /* Unique identifier for this interface. */
+ domid_t domid;
+ unsigned int handle;
+
+ /* Physical parameters of the comms window. */
+ unsigned int irq;
+
+ /* The shared rings and indexes. */
+ tpmif_tx_interface_t *tx;
+ struct vm_struct *tx_area;
+
+ /* Miscellaneous private stuff. */
+ enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
+ int active;
+
+ struct tpmif_st *hash_next;
+ struct list_head list; /* scheduling list */
+ atomic_t refcnt;
+
+ struct backend_info *bi;
+
+ grant_handle_t shmem_handle;
+ grant_ref_t shmem_ref;
+ struct page **mmap_pages;
+
+ char devname[20];
+} tpmif_t;
+
+void tpmif_disconnect_complete(tpmif_t * tpmif);
+tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi);
+int tpmif_interface_init(void);
+void tpmif_interface_exit(void);
+void tpmif_schedule_work(tpmif_t * tpmif);
+void tpmif_deschedule_work(tpmif_t * tpmif);
+int tpmif_xenbus_init(void);
+void tpmif_xenbus_exit(void);
+int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
+irqreturn_t tpmif_be_int(int irq, void *dev_id);
+
+long int tpmback_get_instance(struct backend_info *bi);
+
+int vtpm_release_packets(tpmif_t * tpmif, int send_msgs);
+
+
+#define tpmif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define tpmif_put(_b) \
+ do { \
+ if (atomic_dec_and_test(&(_b)->refcnt)) \
+ tpmif_disconnect_complete(_b); \
+ } while (0)
+
+extern int num_frontends;
+
+static inline unsigned long idx_to_kaddr(tpmif_t *t, unsigned int idx)
+{
+ return (unsigned long)pfn_to_kaddr(page_to_pfn(t->mmap_pages[idx]));
+}
+
+#endif /* __TPMIF__BACKEND__COMMON_H__ */
--- /dev/null
+ /*****************************************************************************
+ * drivers/xen/tpmback/interface.c
+ *
+ * Vritual TPM interface management.
+ *
+ * Copyright (c) 2005, IBM Corporation
+ *
+ * Author: Stefan Berger, stefanb@us.ibm.com
+ *
+ * This code has been derived from drivers/xen/netback/interface.c
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+#include <linux/err.h>
+#include <xen/balloon.h>
+#include <xen/gnttab.h>
+
+static struct kmem_cache *tpmif_cachep;
+int num_frontends = 0;
+
+LIST_HEAD(tpmif_list);
+
+static tpmif_t *alloc_tpmif(domid_t domid, struct backend_info *bi)
+{
+ tpmif_t *tpmif;
+
+ tpmif = kmem_cache_alloc(tpmif_cachep, GFP_KERNEL);
+ if (tpmif == NULL)
+ goto out_of_memory;
+
+ memset(tpmif, 0, sizeof (*tpmif));
+ tpmif->domid = domid;
+ tpmif->status = DISCONNECTED;
+ tpmif->bi = bi;
+ snprintf(tpmif->devname, sizeof(tpmif->devname), "tpmif%d", domid);
+ atomic_set(&tpmif->refcnt, 1);
+
+ tpmif->mmap_pages = alloc_empty_pages_and_pagevec(TPMIF_TX_RING_SIZE);
+ if (tpmif->mmap_pages == NULL)
+ goto out_of_memory;
+
+ list_add(&tpmif->tpmif_list, &tpmif_list);
+ num_frontends++;
+
+ return tpmif;
+
+ out_of_memory:
+ if (tpmif != NULL)
+ kmem_cache_free(tpmif_cachep, tpmif);
+ printk("%s: out of memory\n", __FUNCTION__);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void free_tpmif(tpmif_t * tpmif)
+{
+ num_frontends--;
+ list_del(&tpmif->tpmif_list);
+ free_empty_pages_and_pagevec(tpmif->mmap_pages, TPMIF_TX_RING_SIZE);
+ kmem_cache_free(tpmif_cachep, tpmif);
+}
+
+tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi)
+{
+ tpmif_t *tpmif;
+
+ list_for_each_entry(tpmif, &tpmif_list, tpmif_list) {
+ if (tpmif->bi == bi) {
+ if (tpmif->domid == domid) {
+ tpmif_get(tpmif);
+ return tpmif;
+ } else {
+ return ERR_PTR(-EEXIST);
+ }
+ }
+ }
+
+ return alloc_tpmif(domid, bi);
+}
+
+static int map_frontend_page(tpmif_t *tpmif, unsigned long shared_page)
+{
+ struct gnttab_map_grant_ref op;
+
+ gnttab_set_map_op(&op, (unsigned long)tpmif->tx_area->addr,
+ GNTMAP_host_map, shared_page, tpmif->domid);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status) {
+ DPRINTK(" Grant table operation failure !\n");
+ return op.status;
+ }
+
+ tpmif->shmem_ref = shared_page;
+ tpmif->shmem_handle = op.handle;
+
+ return 0;
+}
+
+static void unmap_frontend_page(tpmif_t *tpmif)
+{
+ struct gnttab_unmap_grant_ref op;
+
+ gnttab_set_unmap_op(&op, (unsigned long)tpmif->tx_area->addr,
+ GNTMAP_host_map, tpmif->shmem_handle);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+}
+
+int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn)
+{
+ int err;
+
+ if (tpmif->irq)
+ return 0;
+
+ if ((tpmif->tx_area = alloc_vm_area(PAGE_SIZE)) == NULL)
+ return -ENOMEM;
+
+ err = map_frontend_page(tpmif, shared_page);
+ if (err) {
+ free_vm_area(tpmif->tx_area);
+ return err;
+ }
+
+ tpmif->tx = (tpmif_tx_interface_t *)tpmif->tx_area->addr;
+ memset(tpmif->tx, 0, PAGE_SIZE);
+
+ err = bind_interdomain_evtchn_to_irqhandler(
+ tpmif->domid, evtchn, tpmif_be_int, 0, tpmif->devname, tpmif);
+ if (err < 0) {
+ unmap_frontend_page(tpmif);
+ free_vm_area(tpmif->tx_area);
+ return err;
+ }
+ tpmif->irq = err;
+
+ tpmif->shmem_ref = shared_page;
+ tpmif->active = 1;
+
+ return 0;
+}
+
+void tpmif_disconnect_complete(tpmif_t *tpmif)
+{
+ if (tpmif->irq)
+ unbind_from_irqhandler(tpmif->irq, tpmif);
+
+ if (tpmif->tx) {
+ unmap_frontend_page(tpmif);
+ free_vm_area(tpmif->tx_area);
+ }
+
+ free_tpmif(tpmif);
+}
+
+int __init tpmif_interface_init(void)
+{
+ tpmif_cachep = kmem_cache_create("tpmif_cache", sizeof (tpmif_t),
+ 0, 0, NULL);
+ return tpmif_cachep ? 0 : -ENOMEM;
+}
+
+void tpmif_interface_exit(void)
+{
+ kmem_cache_destroy(tpmif_cachep);
+}
--- /dev/null
+/******************************************************************************
+ * drivers/xen/tpmback/tpmback.c
+ *
+ * Copyright (c) 2005, IBM Corporation
+ *
+ * Author: Stefan Berger, stefanb@us.ibm.com
+ * Grant table support: Mahadevan Gomathisankaran
+ *
+ * This code has been derived from drivers/xen/netback/netback.c
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ */
+
+#include "common.h"
+#include <xen/evtchn.h>
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/miscdevice.h>
+#include <linux/poll.h>
+#include <asm/uaccess.h>
+#include <xen/xenbus.h>
+#include <xen/interface/grant_table.h>
+#include <xen/gnttab.h>
+
+/* local data structures */
+struct data_exchange {
+ struct list_head pending_pak;
+ struct list_head current_pak;
+ unsigned int copied_so_far;
+ u8 has_opener:1;
+ u8 aborted:1;
+ rwlock_t pak_lock; // protects all of the previous fields
+ wait_queue_head_t wait_queue;
+};
+
+struct vtpm_resp_hdr {
+ uint32_t instance_no;
+ uint16_t tag_no;
+ uint32_t len_no;
+ uint32_t ordinal_no;
+} __attribute__ ((packed));
+
+struct packet {
+ struct list_head next;
+ unsigned int data_len;
+ u8 *data_buffer;
+ tpmif_t *tpmif;
+ u32 tpm_instance;
+ u8 req_tag;
+ u32 last_read;
+ u8 flags;
+ struct timer_list processing_timer;
+};
+
+enum {
+ PACKET_FLAG_DISCARD_RESPONSE = 1,
+};
+
+/* local variables */
+static struct data_exchange dataex;
+
+/* local function prototypes */
+static int _packet_write(struct packet *pak,
+ const char *data, size_t size, int userbuffer);
+static void processing_timeout(unsigned long ptr);
+static int packet_read_shmem(struct packet *pak,
+ tpmif_t * tpmif,
+ u32 offset,
+ char *buffer, int isuserbuffer, u32 left);
+static int vtpm_queue_packet(struct packet *pak);
+
+/***************************************************************
+ Buffer copying fo user and kernel space buffes.
+***************************************************************/
+static inline int copy_from_buffer(void *to,
+ const void *from, unsigned long size,
+ int isuserbuffer)
+{
+ if (isuserbuffer) {
+ if (copy_from_user(to, (void __user *)from, size))
+ return -EFAULT;
+ } else {
+ memcpy(to, from, size);
+ }
+ return 0;
+}
+
+static inline int copy_to_buffer(void *to,
+ const void *from, unsigned long size,
+ int isuserbuffer)
+{
+ if (isuserbuffer) {
+ if (copy_to_user((void __user *)to, from, size))
+ return -EFAULT;
+ } else {
+ memcpy(to, from, size);
+ }
+ return 0;
+}
+
+
+static void dataex_init(struct data_exchange *dataex)
+{
+ INIT_LIST_HEAD(&dataex->pending_pak);
+ INIT_LIST_HEAD(&dataex->current_pak);
+ dataex->has_opener = 0;
+ rwlock_init(&dataex->pak_lock);
+ init_waitqueue_head(&dataex->wait_queue);
+}
+
+/***************************************************************
+ Packet-related functions
+***************************************************************/
+
+static struct packet *packet_find_instance(struct list_head *head,
+ u32 tpm_instance)
+{
+ struct packet *pak;
+ struct list_head *p;
+
+ /*
+ * traverse the list of packets and return the first
+ * one with the given instance number
+ */
+ list_for_each(p, head) {
+ pak = list_entry(p, struct packet, next);
+
+ if (pak->tpm_instance == tpm_instance) {
+ return pak;
+ }
+ }
+ return NULL;
+}
+
+static struct packet *packet_find_packet(struct list_head *head, void *packet)
+{
+ struct packet *pak;
+ struct list_head *p;
+
+ /*
+ * traverse the list of packets and return the first
+ * one with the given instance number
+ */
+ list_for_each(p, head) {
+ pak = list_entry(p, struct packet, next);
+
+ if (pak == packet) {
+ return pak;
+ }
+ }
+ return NULL;
+}
+
+static struct packet *packet_alloc(tpmif_t * tpmif,
+ u32 size, u8 req_tag, u8 flags)
+{
+ struct packet *pak = NULL;
+ pak = kzalloc(sizeof (struct packet), GFP_ATOMIC);
+ if (NULL != pak) {
+ if (tpmif) {
+ pak->tpmif = tpmif;
+ pak->tpm_instance = tpmback_get_instance(tpmif->bi);
+ tpmif_get(tpmif);
+ }
+ pak->data_len = size;
+ pak->req_tag = req_tag;
+ pak->last_read = 0;
+ pak->flags = flags;
+
+ /*
+ * cannot do tpmif_get(tpmif); bad things happen
+ * on the last tpmif_put()
+ */
+ init_timer(&pak->processing_timer);
+ pak->processing_timer.function = processing_timeout;
+ pak->processing_timer.data = (unsigned long)pak;
+ }
+ return pak;
+}
+
+static void inline packet_reset(struct packet *pak)
+{
+ pak->last_read = 0;
+}
+
+static void packet_free(struct packet *pak)
+{
+ if (timer_pending(&pak->processing_timer)) {
+ BUG();
+ }
+
+ if (pak->tpmif)
+ tpmif_put(pak->tpmif);
+ kfree(pak->data_buffer);
+ /*
+ * cannot do tpmif_put(pak->tpmif); bad things happen
+ * on the last tpmif_put()
+ */
+ kfree(pak);
+}
+
+
+/*
+ * Write data to the shared memory and send it to the FE.
+ */
+static int packet_write(struct packet *pak,
+ const char *data, size_t size, int isuserbuffer)
+{
+ int rc = 0;
+
+ if (0 != (pak->flags & PACKET_FLAG_DISCARD_RESPONSE)) {
+ /* Don't send a respone to this packet. Just acknowledge it. */
+ rc = size;
+ } else {
+ rc = _packet_write(pak, data, size, isuserbuffer);
+ }
+
+ return rc;
+}
+
+int _packet_write(struct packet *pak,
+ const char *data, size_t size, int isuserbuffer)
+{
+ /*
+ * Write into the shared memory pages directly
+ * and send it to the front end.
+ */
+ tpmif_t *tpmif = pak->tpmif;
+ grant_handle_t handle;
+ int rc = 0;
+ unsigned int i = 0;
+ unsigned int offset = 0;
+
+ if (tpmif == NULL) {
+ return -EFAULT;
+ }
+
+ if (tpmif->status == DISCONNECTED) {
+ return size;
+ }
+
+ while (offset < size && i < TPMIF_TX_RING_SIZE) {
+ unsigned int tocopy;
+ struct gnttab_map_grant_ref map_op;
+ struct gnttab_unmap_grant_ref unmap_op;
+ tpmif_tx_request_t *tx;
+
+ tx = &tpmif->tx->ring[i].req;
+
+ if (0 == tx->addr) {
+ DPRINTK("ERROR: Buffer for outgoing packet NULL?! i=%d\n", i);
+ return 0;
+ }
+
+ gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i),
+ GNTMAP_host_map, tx->ref, tpmif->domid);
+
+ if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+ &map_op, 1))) {
+ BUG();
+ }
+
+ handle = map_op.handle;
+
+ if (map_op.status) {
+ DPRINTK(" Grant table operation failure !\n");
+ return 0;
+ }
+
+ tocopy = min_t(size_t, size - offset, PAGE_SIZE);
+
+ if (copy_from_buffer((void *)(idx_to_kaddr(tpmif, i) |
+ (tx->addr & ~PAGE_MASK)),
+ &data[offset], tocopy, isuserbuffer)) {
+ tpmif_put(tpmif);
+ return -EFAULT;
+ }
+ tx->size = tocopy;
+
+ gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i),
+ GNTMAP_host_map, handle);
+
+ if (unlikely
+ (HYPERVISOR_grant_table_op
+ (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
+ BUG();
+ }
+
+ offset += tocopy;
+ i++;
+ }
+
+ rc = offset;
+ DPRINTK("Notifying frontend via irq %d\n", tpmif->irq);
+ notify_remote_via_irq(tpmif->irq);
+
+ return rc;
+}
+
+/*
+ * Read data from the shared memory and copy it directly into the
+ * provided buffer. Advance the read_last indicator which tells
+ * how many bytes have already been read.
+ */
+static int packet_read(struct packet *pak, size_t numbytes,
+ char *buffer, size_t buffersize, int isuserbuffer)
+{
+ tpmif_t *tpmif = pak->tpmif;
+
+ /*
+ * Read 'numbytes' of data from the buffer. The first 4
+ * bytes are the instance number in network byte order,
+ * after that come the data from the shared memory buffer.
+ */
+ u32 to_copy;
+ u32 offset = 0;
+ u32 room_left = buffersize;
+
+ if (pak->last_read < 4) {
+ /*
+ * copy the instance number into the buffer
+ */
+ u32 instance_no = htonl(pak->tpm_instance);
+ u32 last_read = pak->last_read;
+
+ to_copy = min_t(size_t, 4 - last_read, numbytes);
+
+ if (copy_to_buffer(&buffer[0],
+ &(((u8 *) & instance_no)[last_read]),
+ to_copy, isuserbuffer)) {
+ return -EFAULT;
+ }
+
+ pak->last_read += to_copy;
+ offset += to_copy;
+ room_left -= to_copy;
+ }
+
+ /*
+ * If the packet has a data buffer appended, read from it...
+ */
+
+ if (room_left > 0) {
+ if (pak->data_buffer) {
+ u32 to_copy = min_t(u32, pak->data_len - offset, room_left);
+ u32 last_read = pak->last_read - 4;
+
+ if (copy_to_buffer(&buffer[offset],
+ &pak->data_buffer[last_read],
+ to_copy, isuserbuffer)) {
+ return -EFAULT;
+ }
+ pak->last_read += to_copy;
+ offset += to_copy;
+ } else {
+ offset = packet_read_shmem(pak,
+ tpmif,
+ offset,
+ buffer,
+ isuserbuffer, room_left);
+ }
+ }
+ return offset;
+}
+
+static int packet_read_shmem(struct packet *pak,
+ tpmif_t * tpmif,
+ u32 offset, char *buffer, int isuserbuffer,
+ u32 room_left)
+{
+ u32 last_read = pak->last_read - 4;
+ u32 i = (last_read / PAGE_SIZE);
+ u32 pg_offset = last_read & (PAGE_SIZE - 1);
+ u32 to_copy;
+ grant_handle_t handle;
+
+ tpmif_tx_request_t *tx;
+
+ tx = &tpmif->tx->ring[0].req;
+ /*
+ * Start copying data at the page with index 'index'
+ * and within that page at offset 'offset'.
+ * Copy a maximum of 'room_left' bytes.
+ */
+ to_copy = min_t(u32, PAGE_SIZE - pg_offset, room_left);
+ while (to_copy > 0) {
+ void *src;
+ struct gnttab_map_grant_ref map_op;
+ struct gnttab_unmap_grant_ref unmap_op;
+
+ tx = &tpmif->tx->ring[i].req;
+
+ gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i),
+ GNTMAP_host_map, tx->ref, tpmif->domid);
+
+ if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+ &map_op, 1))) {
+ BUG();
+ }
+
+ if (map_op.status) {
+ DPRINTK(" Grant table operation failure !\n");
+ return -EFAULT;
+ }
+
+ handle = map_op.handle;
+
+ if (to_copy > tx->size) {
+ /*
+ * User requests more than what's available
+ */
+ to_copy = min_t(u32, tx->size, to_copy);
+ }
+
+ DPRINTK("Copying from mapped memory at %08lx\n",
+ (unsigned long)(idx_to_kaddr(tpmif, i) |
+ (tx->addr & ~PAGE_MASK)));
+
+ src = (void *)(idx_to_kaddr(tpmif, i) |
+ ((tx->addr & ~PAGE_MASK) + pg_offset));
+ if (copy_to_buffer(&buffer[offset],
+ src, to_copy, isuserbuffer)) {
+ return -EFAULT;
+ }
+
+ DPRINTK("Data from TPM-FE of domain %d are %d %d %d %d\n",
+ tpmif->domid, buffer[offset], buffer[offset + 1],
+ buffer[offset + 2], buffer[offset + 3]);
+
+ gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i),
+ GNTMAP_host_map, handle);
+
+ if (unlikely
+ (HYPERVISOR_grant_table_op
+ (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
+ BUG();
+ }
+
+ offset += to_copy;
+ pg_offset = 0;
+ last_read += to_copy;
+ room_left -= to_copy;
+
+ to_copy = min_t(u32, PAGE_SIZE, room_left);
+ i++;
+ } /* while (to_copy > 0) */
+ /*
+ * Adjust the last_read pointer
+ */
+ pak->last_read = last_read + 4;
+ return offset;
+}
+
+/* ============================================================
+ * The file layer for reading data from this device
+ * ============================================================
+ */
+static int vtpm_op_open(struct inode *inode, struct file *f)
+{
+ int rc = 0;
+ unsigned long flags;
+
+ write_lock_irqsave(&dataex.pak_lock, flags);
+ if (dataex.has_opener == 0) {
+ dataex.has_opener = 1;
+ } else {
+ rc = -EPERM;
+ }
+ write_unlock_irqrestore(&dataex.pak_lock, flags);
+ return rc;
+}
+
+static ssize_t vtpm_op_read(struct file *file,
+ char __user * data, size_t size, loff_t * offset)
+{
+ int ret_size = -ENODATA;
+ struct packet *pak = NULL;
+ unsigned long flags;
+
+ write_lock_irqsave(&dataex.pak_lock, flags);
+ if (dataex.aborted) {
+ dataex.aborted = 0;
+ dataex.copied_so_far = 0;
+ write_unlock_irqrestore(&dataex.pak_lock, flags);
+ return -EIO;
+ }
+
+ if (list_empty(&dataex.pending_pak)) {
+ write_unlock_irqrestore(&dataex.pak_lock, flags);
+ wait_event_interruptible(dataex.wait_queue,
+ !list_empty(&dataex.pending_pak));
+ write_lock_irqsave(&dataex.pak_lock, flags);
+ dataex.copied_so_far = 0;
+ }
+
+ if (!list_empty(&dataex.pending_pak)) {
+ unsigned int left;
+
+ pak = list_entry(dataex.pending_pak.next, struct packet, next);
+ left = pak->data_len - dataex.copied_so_far;
+ list_del(&pak->next);
+ write_unlock_irqrestore(&dataex.pak_lock, flags);
+
+ DPRINTK("size given by app: %zu, available: %u\n", size, left);
+
+ ret_size = min_t(size_t, size, left);
+
+ ret_size = packet_read(pak, ret_size, data, size, 1);
+
+ write_lock_irqsave(&dataex.pak_lock, flags);
+
+ if (ret_size < 0) {
+ del_singleshot_timer_sync(&pak->processing_timer);
+ packet_free(pak);
+ dataex.copied_so_far = 0;
+ } else {
+ DPRINTK("Copied %d bytes to user buffer\n", ret_size);
+
+ dataex.copied_so_far += ret_size;
+ if (dataex.copied_so_far >= pak->data_len + 4) {
+ DPRINTK("All data from this packet given to app.\n");
+ /* All data given to app */
+
+ del_singleshot_timer_sync(&pak->
+ processing_timer);
+ list_add_tail(&pak->next, &dataex.current_pak);
+ /*
+ * The more fontends that are handled at the same time,
+ * the more time we give the TPM to process the request.
+ */
+ mod_timer(&pak->processing_timer,
+ jiffies + (num_frontends * 60 * HZ));
+ dataex.copied_so_far = 0;
+ } else {
+ list_add(&pak->next, &dataex.pending_pak);
+ }
+ }
+ }
+ write_unlock_irqrestore(&dataex.pak_lock, flags);
+
+ DPRINTK("Returning result from read to app: %d\n", ret_size);
+
+ return ret_size;
+}
+
+/*
+ * Write operation - only works after a previous read operation!
+ */
+static ssize_t vtpm_op_write(struct file *file,
+ const char __user * data, size_t size,
+ loff_t * offset)
+{
+ struct packet *pak;
+ int rc = 0;
+ unsigned int off = 4;
+ unsigned long flags;
+ struct vtpm_resp_hdr vrh;
+
+ /*
+ * Minimum required packet size is:
+ * 4 bytes for instance number
+ * 2 bytes for tag
+ * 4 bytes for paramSize
+ * 4 bytes for the ordinal
+ * sum: 14 bytes
+ */
+ if (size < sizeof (vrh))
+ return -EFAULT;
+
+ if (copy_from_user(&vrh, data, sizeof (vrh)))
+ return -EFAULT;
+
+ /* malformed packet? */
+ if ((off + ntohl(vrh.len_no)) != size)
+ return -EFAULT;
+
+ write_lock_irqsave(&dataex.pak_lock, flags);
+ pak = packet_find_instance(&dataex.current_pak,
+ ntohl(vrh.instance_no));
+
+ if (pak == NULL) {
+ write_unlock_irqrestore(&dataex.pak_lock, flags);
+ DPRINTK(KERN_ALERT "No associated packet! (inst=%d)\n",
+ ntohl(vrh.instance_no));
+ return -EFAULT;
+ }
+
+ del_singleshot_timer_sync(&pak->processing_timer);
+ list_del(&pak->next);
+
+ write_unlock_irqrestore(&dataex.pak_lock, flags);
+
+ /*
+ * The first 'offset' bytes must be the instance number - skip them.
+ */
+ size -= off;
+
+ rc = packet_write(pak, &data[off], size, 1);
+
+ if (rc > 0) {
+ /* I neglected the first 4 bytes */
+ rc += off;
+ }
+ packet_free(pak);
+ return rc;
+}
+
+static int vtpm_op_release(struct inode *inode, struct file *file)
+{
+ unsigned long flags;
+
+ vtpm_release_packets(NULL, 1);
+ write_lock_irqsave(&dataex.pak_lock, flags);
+ dataex.has_opener = 0;
+ write_unlock_irqrestore(&dataex.pak_lock, flags);
+ return 0;
+}
+
+static unsigned int vtpm_op_poll(struct file *file,
+ struct poll_table_struct *pts)
+{
+ unsigned int flags = POLLOUT | POLLWRNORM;
+
+ poll_wait(file, &dataex.wait_queue, pts);
+ if (!list_empty(&dataex.pending_pak)) {
+ flags |= POLLIN | POLLRDNORM;
+ }
+ return flags;
+}
+
+static const struct file_operations vtpm_ops = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .open = vtpm_op_open,
+ .read = vtpm_op_read,
+ .write = vtpm_op_write,
+ .release = vtpm_op_release,
+ .poll = vtpm_op_poll,
+};
+
+static struct miscdevice vtpms_miscdevice = {
+ .minor = 225,
+ .name = "vtpm",
+ .fops = &vtpm_ops,
+};
+
+/***************************************************************
+ Utility functions
+***************************************************************/
+
+static int tpm_send_fail_message(struct packet *pak, u8 req_tag)
+{
+ int rc;
+ static const unsigned char tpm_error_message_fail[] = {
+ 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x0a,
+ 0x00, 0x00, 0x00, 0x09 /* TPM_FAIL */
+ };
+ unsigned char buffer[sizeof (tpm_error_message_fail)];
+
+ memcpy(buffer, tpm_error_message_fail,
+ sizeof (tpm_error_message_fail));
+ /*
+ * Insert the right response tag depending on the given tag
+ * All response tags are '+3' to the request tag.
+ */
+ buffer[1] = req_tag + 3;
+
+ /*
+ * Write the data to shared memory and notify the front-end
+ */
+ rc = packet_write(pak, buffer, sizeof (buffer), 0);
+
+ return rc;
+}
+
+static int _vtpm_release_packets(struct list_head *head,
+ tpmif_t * tpmif, int send_msgs)
+{
+ int aborted = 0;
+ int c = 0;
+ struct packet *pak;
+ struct list_head *pos, *tmp;
+
+ list_for_each_safe(pos, tmp, head) {
+ pak = list_entry(pos, struct packet, next);
+ c += 1;
+
+ if (tpmif == NULL || pak->tpmif == tpmif) {
+ int can_send = 0;
+
+ del_singleshot_timer_sync(&pak->processing_timer);
+ list_del(&pak->next);
+
+ if (pak->tpmif && pak->tpmif->status == CONNECTED) {
+ can_send = 1;
+ }
+
+ if (send_msgs && can_send) {
+ tpm_send_fail_message(pak, pak->req_tag);
+ }
+ packet_free(pak);
+ if (c == 1)
+ aborted = 1;
+ }
+ }
+ return aborted;
+}
+
+int vtpm_release_packets(tpmif_t * tpmif, int send_msgs)
+{
+ unsigned long flags;
+
+ write_lock_irqsave(&dataex.pak_lock, flags);
+
+ dataex.aborted = _vtpm_release_packets(&dataex.pending_pak,
+ tpmif,
+ send_msgs);
+ _vtpm_release_packets(&dataex.current_pak, tpmif, send_msgs);
+
+ write_unlock_irqrestore(&dataex.pak_lock, flags);
+ return 0;
+}
+
+static int vtpm_queue_packet(struct packet *pak)
+{
+ int rc = 0;
+
+ if (dataex.has_opener) {
+ unsigned long flags;
+
+ write_lock_irqsave(&dataex.pak_lock, flags);
+ list_add_tail(&pak->next, &dataex.pending_pak);
+ /* give the TPM some time to pick up the request */
+ mod_timer(&pak->processing_timer, jiffies + (30 * HZ));
+ write_unlock_irqrestore(&dataex.pak_lock, flags);
+
+ wake_up_interruptible(&dataex.wait_queue);
+ } else {
+ rc = -EFAULT;
+ }
+ return rc;
+}
+
+static int vtpm_receive(tpmif_t * tpmif, u32 size)
+{
+ int rc = 0;
+ unsigned char buffer[10];
+ __be32 *native_size;
+ struct packet *pak = packet_alloc(tpmif, size, 0, 0);
+
+ if (!pak)
+ return -ENOMEM;
+ /*
+ * Read 10 bytes from the received buffer to test its
+ * content for validity.
+ */
+ if (sizeof (buffer) != packet_read(pak,
+ sizeof (buffer), buffer,
+ sizeof (buffer), 0)) {
+ goto failexit;
+ }
+ /*
+ * Reset the packet read pointer so we can read all its
+ * contents again.
+ */
+ packet_reset(pak);
+
+ native_size = (__force __be32 *) (&buffer[4 + 2]);
+ /*
+ * Verify that the size of the packet is correct
+ * as indicated and that there's actually someone reading packets.
+ * The minimum size of the packet is '10' for tag, size indicator
+ * and ordinal.
+ */
+ if (size < 10 ||
+ be32_to_cpu(*native_size) != size ||
+ 0 == dataex.has_opener || tpmif->status != CONNECTED) {
+ rc = -EINVAL;
+ goto failexit;
+ } else {
+ rc = vtpm_queue_packet(pak);
+ if (rc < 0)
+ goto failexit;
+ }
+ return 0;
+
+ failexit:
+ if (pak) {
+ tpm_send_fail_message(pak, buffer[4 + 1]);
+ packet_free(pak);
+ }
+ return rc;
+}
+
+/*
+ * Timeout function that gets invoked when a packet has not been processed
+ * during the timeout period.
+ * The packet must be on a list when this function is invoked. This
+ * also means that once its taken off a list, the timer must be
+ * destroyed as well.
+ */
+static void processing_timeout(unsigned long ptr)
+{
+ struct packet *pak = (struct packet *)ptr;
+ unsigned long flags;
+
+ write_lock_irqsave(&dataex.pak_lock, flags);
+ /*
+ * The packet needs to be searched whether it
+ * is still on the list.
+ */
+ if (pak == packet_find_packet(&dataex.pending_pak, pak) ||
+ pak == packet_find_packet(&dataex.current_pak, pak)) {
+ if ((pak->flags & PACKET_FLAG_DISCARD_RESPONSE) == 0) {
+ tpm_send_fail_message(pak, pak->req_tag);
+ }
+ /* discard future responses */
+ pak->flags |= PACKET_FLAG_DISCARD_RESPONSE;
+ }
+
+ write_unlock_irqrestore(&dataex.pak_lock, flags);
+}
+
+static void tpm_tx_action(unsigned long unused);
+static DECLARE_TASKLET(tpm_tx_tasklet, tpm_tx_action, 0);
+
+static struct list_head tpm_schedule_list;
+static spinlock_t tpm_schedule_list_lock;
+
+static inline void maybe_schedule_tx_action(void)
+{
+ smp_mb();
+ tasklet_schedule(&tpm_tx_tasklet);
+}
+
+static inline int __on_tpm_schedule_list(tpmif_t * tpmif)
+{
+ return tpmif->list.next != NULL;
+}
+
+static void remove_from_tpm_schedule_list(tpmif_t * tpmif)
+{
+ spin_lock_irq(&tpm_schedule_list_lock);
+ if (likely(__on_tpm_schedule_list(tpmif))) {
+ list_del(&tpmif->list);
+ tpmif->list.next = NULL;
+ tpmif_put(tpmif);
+ }
+ spin_unlock_irq(&tpm_schedule_list_lock);
+}
+
+static void add_to_tpm_schedule_list_tail(tpmif_t * tpmif)
+{
+ if (__on_tpm_schedule_list(tpmif))
+ return;
+
+ spin_lock_irq(&tpm_schedule_list_lock);
+ if (!__on_tpm_schedule_list(tpmif) && tpmif->active) {
+ list_add_tail(&tpmif->list, &tpm_schedule_list);
+ tpmif_get(tpmif);
+ }
+ spin_unlock_irq(&tpm_schedule_list_lock);
+}
+
+void tpmif_schedule_work(tpmif_t * tpmif)
+{
+ add_to_tpm_schedule_list_tail(tpmif);
+ maybe_schedule_tx_action();
+}
+
+void tpmif_deschedule_work(tpmif_t * tpmif)
+{
+ remove_from_tpm_schedule_list(tpmif);
+}
+
+static void tpm_tx_action(unsigned long unused)
+{
+ struct list_head *ent;
+ tpmif_t *tpmif;
+ tpmif_tx_request_t *tx;
+
+ DPRINTK("%s: Getting data from front-end(s)!\n", __FUNCTION__);
+
+ while (!list_empty(&tpm_schedule_list)) {
+ /* Get a tpmif from the list with work to do. */
+ ent = tpm_schedule_list.next;
+ tpmif = list_entry(ent, tpmif_t, list);
+ tpmif_get(tpmif);
+ remove_from_tpm_schedule_list(tpmif);
+
+ tx = &tpmif->tx->ring[0].req;
+
+ /* pass it up */
+ vtpm_receive(tpmif, tx->size);
+
+ tpmif_put(tpmif);
+ }
+}
+
+irqreturn_t tpmif_be_int(int irq, void *dev_id)
+{
+ tpmif_t *tpmif = (tpmif_t *) dev_id;
+
+ add_to_tpm_schedule_list_tail(tpmif);
+ maybe_schedule_tx_action();
+ return IRQ_HANDLED;
+}
+
+static int __init tpmback_init(void)
+{
+ int rc;
+
+ if ((rc = misc_register(&vtpms_miscdevice)) != 0) {
+ printk(KERN_ALERT
+ "Could not register misc device for TPM BE.\n");
+ return rc;
+ }
+
+ dataex_init(&dataex);
+
+ spin_lock_init(&tpm_schedule_list_lock);
+ INIT_LIST_HEAD(&tpm_schedule_list);
+
+ rc = tpmif_interface_init();
+ if (!rc) {
+ rc = tpmif_xenbus_init();
+ if (rc)
+ tpmif_interface_exit();
+ }
+ if (rc) {
+ misc_deregister(&vtpms_miscdevice);
+ return rc;
+ }
+
+ printk(KERN_ALERT "Successfully initialized TPM backend driver.\n");
+
+ return 0;
+}
+module_init(tpmback_init);
+
+static void __exit tpmback_exit(void)
+{
+ vtpm_release_packets(NULL, 0);
+ tpmif_xenbus_exit();
+ tpmif_interface_exit();
+ misc_deregister(&vtpms_miscdevice);
+}
+module_exit(tpmback_exit)
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null
+/* Xenbus code for tpmif backend
+ Copyright (C) 2005 IBM Corporation
+ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include <stdarg.h>
+#include <linux/module.h>
+#include <xen/xenbus.h>
+#include "common.h"
+
+struct backend_info
+{
+ struct xenbus_device *dev;
+
+ /* our communications channel */
+ tpmif_t *tpmif;
+
+ long int frontend_id;
+ long int instance; // instance of TPM
+ u8 is_instance_set;// whether instance number has been set
+
+ /* watch front end for changes */
+ struct xenbus_watch backend_watch;
+};
+
+static void maybe_connect(struct backend_info *be);
+static void connect(struct backend_info *be);
+static int connect_ring(struct backend_info *be);
+static void backend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len);
+static void frontend_changed(struct xenbus_device *dev,
+ enum xenbus_state frontend_state);
+
+long int tpmback_get_instance(struct backend_info *bi)
+{
+ long int res = -1;
+ if (bi && bi->is_instance_set)
+ res = bi->instance;
+ return res;
+}
+
+static int tpmback_remove(struct xenbus_device *dev)
+{
+ struct backend_info *be = dev->dev.driver_data;
+
+ if (!be) return 0;
+
+ if (be->backend_watch.node) {
+ unregister_xenbus_watch(&be->backend_watch);
+ kfree(be->backend_watch.node);
+ be->backend_watch.node = NULL;
+ }
+ if (be->tpmif) {
+ be->tpmif->bi = NULL;
+ vtpm_release_packets(be->tpmif, 0);
+ tpmif_put(be->tpmif);
+ be->tpmif = NULL;
+ }
+ kfree(be);
+ dev->dev.driver_data = NULL;
+ return 0;
+}
+
+static int tpmback_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err;
+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
+ GFP_KERNEL);
+
+ if (!be) {
+ xenbus_dev_fatal(dev, -ENOMEM,
+ "allocating backend structure");
+ return -ENOMEM;
+ }
+
+ be->is_instance_set = 0;
+ be->dev = dev;
+ dev->dev.driver_data = be;
+
+ err = xenbus_watch_path2(dev, dev->nodename,
+ "instance", &be->backend_watch,
+ backend_changed);
+ if (err) {
+ goto fail;
+ }
+
+ err = xenbus_switch_state(dev, XenbusStateInitWait);
+ if (err) {
+ goto fail;
+ }
+ return 0;
+fail:
+ tpmback_remove(dev);
+ return err;
+}
+
+
+static void backend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ int err;
+ long instance;
+ struct backend_info *be
+ = container_of(watch, struct backend_info, backend_watch);
+ struct xenbus_device *dev = be->dev;
+
+ err = xenbus_scanf(XBT_NIL, dev->nodename,
+ "instance","%li", &instance);
+ if (XENBUS_EXIST_ERR(err)) {
+ return;
+ }
+
+ if (err != 1) {
+ xenbus_dev_fatal(dev, err, "reading instance");
+ return;
+ }
+
+ if (be->is_instance_set == 0) {
+ be->instance = instance;
+ be->is_instance_set = 1;
+ }
+}
+
+
+static void frontend_changed(struct xenbus_device *dev,
+ enum xenbus_state frontend_state)
+{
+ struct backend_info *be = dev->dev.driver_data;
+ int err;
+
+ switch (frontend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitialised:
+ break;
+
+ case XenbusStateConnected:
+ err = connect_ring(be);
+ if (err) {
+ return;
+ }
+ maybe_connect(be);
+ break;
+
+ case XenbusStateClosing:
+ be->instance = -1;
+ xenbus_switch_state(dev, XenbusStateClosing);
+ break;
+
+ case XenbusStateUnknown: /* keep it here */
+ case XenbusStateClosed:
+ xenbus_switch_state(dev, XenbusStateClosed);
+ device_unregister(&be->dev->dev);
+ tpmback_remove(dev);
+ break;
+
+ default:
+ xenbus_dev_fatal(dev, -EINVAL,
+ "saw state %d at frontend",
+ frontend_state);
+ break;
+ }
+}
+
+
+
+static void maybe_connect(struct backend_info *be)
+{
+ if (be->tpmif == NULL || be->tpmif->status == CONNECTED)
+ return;
+
+ connect(be);
+}
+
+
+static void connect(struct backend_info *be)
+{
+ struct xenbus_transaction xbt;
+ int err;
+ struct xenbus_device *dev = be->dev;
+ unsigned long ready = 1;
+
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(be->dev, err, "starting transaction");
+ return;
+ }
+
+ err = xenbus_printf(xbt, be->dev->nodename,
+ "ready", "%lu", ready);
+ if (err) {
+ xenbus_dev_fatal(be->dev, err, "writing 'ready'");
+ goto abort;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN)
+ goto again;
+ if (err)
+ xenbus_dev_fatal(be->dev, err, "end of transaction");
+
+ err = xenbus_switch_state(dev, XenbusStateConnected);
+ if (!err)
+ be->tpmif->status = CONNECTED;
+ return;
+abort:
+ xenbus_transaction_end(xbt, 1);
+}
+
+
+static int connect_ring(struct backend_info *be)
+{
+ struct xenbus_device *dev = be->dev;
+ unsigned long ring_ref;
+ unsigned int evtchn;
+ int err;
+
+ err = xenbus_gather(XBT_NIL, dev->otherend,
+ "ring-ref", "%lu", &ring_ref,
+ "event-channel", "%u", &evtchn, NULL);
+ if (err) {
+ xenbus_dev_error(dev, err,
+ "reading %s/ring-ref and event-channel",
+ dev->otherend);
+ return err;
+ }
+
+ if (!be->tpmif) {
+ be->tpmif = tpmif_find(dev->otherend_id, be);
+ if (IS_ERR(be->tpmif)) {
+ err = PTR_ERR(be->tpmif);
+ be->tpmif = NULL;
+ xenbus_dev_fatal(dev,err,"creating vtpm interface");
+ return err;
+ }
+ }
+
+ if (be->tpmif != NULL) {
+ err = tpmif_map(be->tpmif, ring_ref, evtchn);
+ if (err) {
+ xenbus_dev_error(dev, err,
+ "mapping shared-frame %lu port %u",
+ ring_ref, evtchn);
+ return err;
+ }
+ }
+ return 0;
+}
+
+
+static const struct xenbus_device_id tpmback_ids[] = {
+ { "vtpm" },
+ { "" }
+};
+
+
+static struct xenbus_driver tpmback = {
+ .name = "vtpm",
+ .ids = tpmback_ids,
+ .probe = tpmback_probe,
+ .remove = tpmback_remove,
+ .otherend_changed = frontend_changed,
+};
+
+
+int tpmif_xenbus_init(void)
+{
+ return xenbus_register_backend(&tpmback);
+}
+
+void tpmif_xenbus_exit(void)
+{
+ xenbus_unregister_driver(&tpmback);
+}
--- /dev/null
+#include <linux/err.h>
+#include <linux/module.h>
+#include <xen/driver_util.h>
+
+struct class *get_xen_class(void)
+{
+ static struct class *xen_class;
+
+ if (xen_class)
+ return xen_class;
+
+ xen_class = class_create(THIS_MODULE, "xen");
+ if (IS_ERR(xen_class)) {
+ printk("Failed to create xen sysfs class.\n");
+ xen_class = NULL;
+ }
+
+ return xen_class;
+}
+EXPORT_SYMBOL_GPL(get_xen_class);
-obj-y += xenbus.o
+obj-y += xenbus_client.o xenbus_comms.o xenbus_xs.o xenbus_probe.o
+obj-$(CONFIG_XEN_BACKEND) += xenbus_be.o
-xenbus-objs =
-xenbus-objs += xenbus_client.o
-xenbus-objs += xenbus_comms.o
-xenbus-objs += xenbus_xs.o
-xenbus-objs += xenbus_probe.o
+xenbus_be-objs =
+xenbus_be-objs += xenbus_backend_client.o
+
+xenbus-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
+obj-y += $(xenbus-y) $(xenbus-m)
+obj-$(CONFIG_XEN_XENBUS_DEV) += xenbus_dev.o
--- /dev/null
+/******************************************************************************
+ * Backend-client-facing interface for the Xenbus driver. In other words, the
+ * interface between the Xenbus and the device-specific code in the backend
+ * driver.
+ *
+ * Copyright (C) 2005-2006 XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <xen/gnttab.h>
+#include <xen/xenbus.h>
+#include <xen/driver_util.h>
+
+/* Based on Rusty Russell's skeleton driver's map_page */
+struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref)
+{
+ struct gnttab_map_grant_ref op;
+ struct vm_struct *area;
+
+ area = alloc_vm_area(PAGE_SIZE);
+ if (!area)
+ return ERR_PTR(-ENOMEM);
+
+ gnttab_set_map_op(&op, (unsigned long)area->addr, GNTMAP_host_map,
+ gnt_ref, dev->otherend_id);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status != GNTST_okay) {
+ free_vm_area(area);
+ xenbus_dev_fatal(dev, op.status,
+ "mapping in shared page %d from domain %d",
+ gnt_ref, dev->otherend_id);
+ BUG_ON(!IS_ERR(ERR_PTR(op.status)));
+ return ERR_PTR(op.status);
+ }
+
+ /* Stuff the handle in an unused field */
+ area->phys_addr = (unsigned long)op.handle;
+
+ return area;
+}
+EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
+
+
+int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
+ grant_handle_t *handle, void *vaddr)
+{
+ struct gnttab_map_grant_ref op;
+
+ gnttab_set_map_op(&op, (unsigned long)vaddr, GNTMAP_host_map,
+ gnt_ref, dev->otherend_id);
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status != GNTST_okay) {
+ xenbus_dev_fatal(dev, op.status,
+ "mapping in shared page %d from domain %d",
+ gnt_ref, dev->otherend_id);
+ } else
+ *handle = op.handle;
+
+ return op.status;
+}
+EXPORT_SYMBOL_GPL(xenbus_map_ring);
+
+
+/* Based on Rusty Russell's skeleton driver's unmap_page */
+int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *area)
+{
+ struct gnttab_unmap_grant_ref op;
+
+ gnttab_set_unmap_op(&op, (unsigned long)area->addr, GNTMAP_host_map,
+ (grant_handle_t)area->phys_addr);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status == GNTST_okay)
+ free_vm_area(area);
+ else
+ xenbus_dev_error(dev, op.status,
+ "unmapping page at handle %d error %d",
+ (int16_t)area->phys_addr, op.status);
+
+ return op.status;
+}
+EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
+
+
+int xenbus_unmap_ring(struct xenbus_device *dev,
+ grant_handle_t handle, void *vaddr)
+{
+ struct gnttab_unmap_grant_ref op;
+
+ gnttab_set_unmap_op(&op, (unsigned long)vaddr, GNTMAP_host_map,
+ handle);
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status != GNTST_okay)
+ xenbus_dev_error(dev, op.status,
+ "unmapping page at handle %d error %d",
+ handle, op.status);
+
+ return op.status;
+}
+EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
+
+int xenbus_dev_is_online(struct xenbus_device *dev)
+{
+ int rc, val;
+
+ rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val);
+ if (rc != 1)
+ val = 0; /* no online node present */
+
+ return val;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
+
+MODULE_LICENSE("Dual BSD/GPL");
* IN THE SOFTWARE.
*/
+#if defined(CONFIG_XEN) || defined(MODULE)
+#include <linux/slab.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <xen/driver_util.h>
+#else
#include <linux/types.h>
#include <linux/vmalloc.h>
#include <asm/xen/hypervisor.h>
#include <xen/interface/event_channel.h>
#include <xen/events.h>
#include <xen/grant_table.h>
+#endif
#include <xen/xenbus.h>
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
const char *xenbus_strstate(enum xenbus_state state)
{
static const char *const name[] = {
EXPORT_SYMBOL_GPL(xenbus_watch_path);
+#if defined(CONFIG_XEN) || defined(MODULE)
+int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
+ const char *path2, struct xenbus_watch *watch,
+ void (*callback)(struct xenbus_watch *,
+ const char **, unsigned int))
+{
+ int err;
+ char *state = kasprintf(GFP_KERNEL|__GFP_HIGH, "%s/%s", path, path2);
+ if (!state) {
+ xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
+ return -ENOMEM;
+ }
+ err = xenbus_watch_path(dev, state, watch, callback);
+
+ if (err)
+ kfree(state);
+ return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_watch_path2);
+#else
/**
* xenbus_watch_pathfmt - register a watch on a sprintf-formatted path
* @dev: xenbus device
return err;
}
EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
+#endif
/**
}
-static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
- const char *fmt, va_list ap)
+static void _dev_error(struct xenbus_device *dev, int err,
+ const char *fmt, va_list ap)
{
int ret;
unsigned int len;
- char *printf_buffer = NULL;
- char *path_buffer = NULL;
+ char *printf_buffer = NULL, *path_buffer = NULL;
#define PRINTF_BUFFER_SIZE 4096
printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
path_buffer = error_path(dev);
if (path_buffer == NULL) {
- dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
- dev->nodename, printf_buffer);
+ dev_err(&dev->dev,
+ "xenbus: failed to write error node for %s (%s)\n",
+ dev->nodename, printf_buffer);
goto fail;
}
if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
- dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
- dev->nodename, printf_buffer);
+ dev_err(&dev->dev,
+ "xenbus: failed to write error node for %s (%s)\n",
+ dev->nodename, printf_buffer);
goto fail;
}
fail:
- kfree(printf_buffer);
- kfree(path_buffer);
+ if (printf_buffer)
+ kfree(printf_buffer);
+ if (path_buffer)
+ kfree(path_buffer);
}
* Report the given negative errno into the store, along with the given
* formatted message.
*/
-void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...)
+void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
+ ...)
{
va_list ap;
va_start(ap, fmt);
- xenbus_va_dev_error(dev, err, fmt, ap);
+ _dev_error(dev, err, fmt, ap);
va_end(ap);
}
EXPORT_SYMBOL_GPL(xenbus_dev_error);
+
/**
* xenbus_dev_fatal
* @dev: xenbus device
* xenbus_switch_state(dev, NULL, XenbusStateClosing) to schedule an orderly
* closedown of this driver and its peer.
*/
-
-void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
+void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
+ ...)
{
va_list ap;
va_start(ap, fmt);
- xenbus_va_dev_error(dev, err, fmt, ap);
+ _dev_error(dev, err, fmt, ap);
va_end(ap);
xenbus_switch_state(dev, XenbusStateClosing);
}
EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
+
/**
* xenbus_grant_ring
* @dev: xenbus device
* @ring_mfn: mfn of ring to grant
-
+ *
* Grant access to the given @ring_mfn to the peer of the given device. Return
* 0 on success, or -errno on error. On error, the device will switch to
* XenbusStateClosing, and the error will be saved in the store.
struct evtchn_alloc_unbound alloc_unbound;
int err;
- alloc_unbound.dom = DOMID_SELF;
+ alloc_unbound.dom = DOMID_SELF;
alloc_unbound.remote_dom = dev->otherend_id;
err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
+#if 0 /* !defined(CONFIG_XEN) && !defined(MODULE) */
/**
* Bind to an existing interdomain event channel in another domain. Returns 0
* on success and stores the local port in *port. On error, returns -errno,
return err;
}
EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
+#endif
/**
EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
+#if 0 /* !defined(CONFIG_XEN) && !defined(MODULE) */
/**
* xenbus_map_ring_valloc
* @dev: xenbus device
return op.status;
}
EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
+#endif
/**
#include <linux/sched.h>
#include <linux/err.h>
#include <xen/xenbus.h>
+#if defined(CONFIG_XEN) || defined(MODULE)
+#include <xen/evtchn.h>
+#include <asm/hypervisor.h>
+#else
#include <asm/xen/hypervisor.h>
#include <xen/events.h>
#include <xen/page.h>
+#endif
+
#include "xenbus_comms.h"
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
static int xenbus_irq;
+extern void xenbus_probe(struct work_struct *);
+extern int xenstored_ready;
static DECLARE_WORK(probe_work, xenbus_probe);
static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
if (xenbus_irq)
unbind_from_irqhandler(xenbus_irq, &xb_waitq);
+#if defined(CONFIG_XEN) || defined(MODULE)
+ err = bind_caller_port_to_irqhandler(
+#else
err = bind_evtchn_to_irqhandler(
+#endif
xen_store_evtchn, wake_waiting,
0, "xenbus", &xb_waitq);
if (err <= 0) {
--- /dev/null
+/*
+ * xenbus_dev.c
+ *
+ * Driver giving user-space access to the kernel's xenbus connection
+ * to xenstore.
+ *
+ * Copyright (c) 2005, Christian Limpach
+ * Copyright (c) 2005, Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#include <linux/notifier.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+
+#include "xenbus_comms.h"
+
+#include <asm/uaccess.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/xen_proc.h>
+#include <asm/hypervisor.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+struct xenbus_dev_transaction {
+ struct list_head list;
+ struct xenbus_transaction handle;
+};
+
+struct read_buffer {
+ struct list_head list;
+ unsigned int cons;
+ unsigned int len;
+ char msg[];
+};
+
+struct xenbus_dev_data {
+ /* In-progress transaction. */
+ struct list_head transactions;
+
+ /* Active watches. */
+ struct list_head watches;
+
+ /* Partial request. */
+ unsigned int len;
+ union {
+ struct xsd_sockmsg msg;
+ char buffer[PAGE_SIZE];
+ } u;
+
+ /* Response queue. */
+ struct list_head read_buffers;
+ wait_queue_head_t read_waitq;
+
+ struct mutex reply_mutex;
+};
+
+static struct proc_dir_entry *xenbus_dev_intf;
+
+static ssize_t xenbus_dev_read(struct file *filp,
+ char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct xenbus_dev_data *u = filp->private_data;
+ struct read_buffer *rb;
+ int i, ret;
+
+ mutex_lock(&u->reply_mutex);
+ while (list_empty(&u->read_buffers)) {
+ mutex_unlock(&u->reply_mutex);
+ ret = wait_event_interruptible(u->read_waitq,
+ !list_empty(&u->read_buffers));
+ if (ret)
+ return ret;
+ mutex_lock(&u->reply_mutex);
+ }
+
+ rb = list_entry(u->read_buffers.next, struct read_buffer, list);
+ for (i = 0; i < len;) {
+ put_user(rb->msg[rb->cons], ubuf + i);
+ i++;
+ rb->cons++;
+ if (rb->cons == rb->len) {
+ list_del(&rb->list);
+ kfree(rb);
+ if (list_empty(&u->read_buffers))
+ break;
+ rb = list_entry(u->read_buffers.next,
+ struct read_buffer, list);
+ }
+ }
+ mutex_unlock(&u->reply_mutex);
+
+ return i;
+}
+
+static void queue_reply(struct xenbus_dev_data *u,
+ char *data, unsigned int len)
+{
+ struct read_buffer *rb;
+
+ if (len == 0)
+ return;
+
+ rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL);
+ BUG_ON(rb == NULL);
+
+ rb->cons = 0;
+ rb->len = len;
+
+ memcpy(rb->msg, data, len);
+
+ list_add_tail(&rb->list, &u->read_buffers);
+
+ wake_up(&u->read_waitq);
+}
+
+struct watch_adapter
+{
+ struct list_head list;
+ struct xenbus_watch watch;
+ struct xenbus_dev_data *dev_data;
+ char *token;
+};
+
+static void free_watch_adapter (struct watch_adapter *watch)
+{
+ kfree(watch->watch.node);
+ kfree(watch->token);
+ kfree(watch);
+}
+
+static void watch_fired(struct xenbus_watch *watch,
+ const char **vec,
+ unsigned int len)
+{
+ struct watch_adapter *adap =
+ container_of(watch, struct watch_adapter, watch);
+ struct xsd_sockmsg hdr;
+ const char *path, *token;
+ int path_len, tok_len, body_len;
+
+ path = vec[XS_WATCH_PATH];
+ token = adap->token;
+
+ path_len = strlen(path) + 1;
+ tok_len = strlen(token) + 1;
+ body_len = path_len + tok_len;
+
+ hdr.type = XS_WATCH_EVENT;
+ hdr.len = body_len;
+
+ mutex_lock(&adap->dev_data->reply_mutex);
+ queue_reply(adap->dev_data, (char *)&hdr, sizeof(hdr));
+ queue_reply(adap->dev_data, (char *)path, path_len);
+ queue_reply(adap->dev_data, (char *)token, tok_len);
+ mutex_unlock(&adap->dev_data->reply_mutex);
+}
+
+static LIST_HEAD(watch_list);
+
+static ssize_t xenbus_dev_write(struct file *filp,
+ const char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct xenbus_dev_data *u = filp->private_data;
+ struct xenbus_dev_transaction *trans = NULL;
+ uint32_t msg_type;
+ void *reply;
+ char *path, *token;
+ struct watch_adapter *watch, *tmp_watch;
+ int err, rc = len;
+
+ if ((len + u->len) > sizeof(u->u.buffer)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ u->len += len;
+ if ((u->len < sizeof(u->u.msg)) ||
+ (u->len < (sizeof(u->u.msg) + u->u.msg.len)))
+ return rc;
+
+ msg_type = u->u.msg.type;
+
+ switch (msg_type) {
+ case XS_TRANSACTION_START:
+ case XS_TRANSACTION_END:
+ case XS_DIRECTORY:
+ case XS_READ:
+ case XS_GET_PERMS:
+ case XS_RELEASE:
+ case XS_GET_DOMAIN_PATH:
+ case XS_WRITE:
+ case XS_MKDIR:
+ case XS_RM:
+ case XS_SET_PERMS:
+ if (msg_type == XS_TRANSACTION_START) {
+ trans = kmalloc(sizeof(*trans), GFP_KERNEL);
+ if (!trans) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ }
+
+ reply = xenbus_dev_request_and_reply(&u->u.msg);
+ if (IS_ERR(reply)) {
+ kfree(trans);
+ rc = PTR_ERR(reply);
+ goto out;
+ }
+
+ if (msg_type == XS_TRANSACTION_START) {
+ trans->handle.id = simple_strtoul(reply, NULL, 0);
+ list_add(&trans->list, &u->transactions);
+ } else if (msg_type == XS_TRANSACTION_END) {
+ list_for_each_entry(trans, &u->transactions, list)
+ if (trans->handle.id == u->u.msg.tx_id)
+ break;
+ BUG_ON(&trans->list == &u->transactions);
+ list_del(&trans->list);
+ kfree(trans);
+ }
+ mutex_lock(&u->reply_mutex);
+ queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg));
+ queue_reply(u, (char *)reply, u->u.msg.len);
+ mutex_unlock(&u->reply_mutex);
+ kfree(reply);
+ break;
+
+ case XS_WATCH:
+ case XS_UNWATCH: {
+ static const char *XS_RESP = "OK";
+ struct xsd_sockmsg hdr;
+
+ path = u->u.buffer + sizeof(u->u.msg);
+ token = memchr(path, 0, u->u.msg.len);
+ if (token == NULL) {
+ rc = -EILSEQ;
+ goto out;
+ }
+ token++;
+
+ if (msg_type == XS_WATCH) {
+ watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+ watch->watch.node = kmalloc(strlen(path)+1,
+ GFP_KERNEL);
+ strcpy((char *)watch->watch.node, path);
+ watch->watch.callback = watch_fired;
+ watch->token = kmalloc(strlen(token)+1, GFP_KERNEL);
+ strcpy(watch->token, token);
+ watch->dev_data = u;
+
+ err = register_xenbus_watch(&watch->watch);
+ if (err) {
+ free_watch_adapter(watch);
+ rc = err;
+ goto out;
+ }
+
+ list_add(&watch->list, &u->watches);
+ } else {
+ list_for_each_entry_safe(watch, tmp_watch,
+ &u->watches, list) {
+ if (!strcmp(watch->token, token) &&
+ !strcmp(watch->watch.node, path))
+ {
+ unregister_xenbus_watch(&watch->watch);
+ list_del(&watch->list);
+ free_watch_adapter(watch);
+ break;
+ }
+ }
+ }
+
+ hdr.type = msg_type;
+ hdr.len = strlen(XS_RESP) + 1;
+ mutex_lock(&u->reply_mutex);
+ queue_reply(u, (char *)&hdr, sizeof(hdr));
+ queue_reply(u, (char *)XS_RESP, hdr.len);
+ mutex_unlock(&u->reply_mutex);
+ break;
+ }
+
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ out:
+ u->len = 0;
+ return rc;
+}
+
+static int xenbus_dev_open(struct inode *inode, struct file *filp)
+{
+ struct xenbus_dev_data *u;
+
+ if (xen_store_evtchn == 0)
+ return -ENOENT;
+
+ nonseekable_open(inode, filp);
+
+ u = kzalloc(sizeof(*u), GFP_KERNEL);
+ if (u == NULL)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&u->transactions);
+ INIT_LIST_HEAD(&u->watches);
+ INIT_LIST_HEAD(&u->read_buffers);
+ init_waitqueue_head(&u->read_waitq);
+
+ mutex_init(&u->reply_mutex);
+
+ filp->private_data = u;
+
+ return 0;
+}
+
+static int xenbus_dev_release(struct inode *inode, struct file *filp)
+{
+ struct xenbus_dev_data *u = filp->private_data;
+ struct xenbus_dev_transaction *trans, *tmp;
+ struct watch_adapter *watch, *tmp_watch;
+
+ list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
+ xenbus_transaction_end(trans->handle, 1);
+ list_del(&trans->list);
+ kfree(trans);
+ }
+
+ list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
+ unregister_xenbus_watch(&watch->watch);
+ list_del(&watch->list);
+ free_watch_adapter(watch);
+ }
+
+ kfree(u);
+
+ return 0;
+}
+
+static unsigned int xenbus_dev_poll(struct file *file, poll_table *wait)
+{
+ struct xenbus_dev_data *u = file->private_data;
+
+ poll_wait(file, &u->read_waitq, wait);
+ if (!list_empty(&u->read_buffers))
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
+static const struct file_operations xenbus_dev_file_ops = {
+ .read = xenbus_dev_read,
+ .write = xenbus_dev_write,
+ .open = xenbus_dev_open,
+ .release = xenbus_dev_release,
+ .poll = xenbus_dev_poll,
+};
+
+int xenbus_dev_init(void)
+{
+ xenbus_dev_intf = create_xen_proc_entry("xenbus", 0400);
+ if (xenbus_dev_intf)
+ xenbus_dev_intf->proc_fops = &xenbus_dev_file_ops;
+
+ return 0;
+}
* Copyright (C) 2005 Rusty Russell, IBM Corporation
* Copyright (C) 2005 Mike Wray, Hewlett-Packard
* Copyright (C) 2005, 2006 XenSource Ltd
+ * Copyright (C) 2007 Solarflare Communications, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
#define DPRINTK(fmt, args...) \
pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
- __func__, __LINE__, ##args)
+ __FUNCTION__, __LINE__, ##args)
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/notifier.h>
-#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/io.h>
#include <asm/page.h>
#include <asm/pgtable.h>
+#if defined(CONFIG_XEN) || defined(MODULE)
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/xen_proc.h>
+#include <xen/evtchn.h>
+#include <xen/features.h>
+#ifdef MODULE
+#include <xen/hvm.h>
+#endif
+#else
#include <asm/xen/hypervisor.h>
#include <xen/xenbus.h>
#include <xen/events.h>
#include <xen/page.h>
+#endif
#include "xenbus_comms.h"
#include "xenbus_probe.h"
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
int xen_store_evtchn;
struct xenstore_domain_interface *xen_store_interface;
static unsigned long xen_store_mfn;
+extern struct mutex xenwatch_mutex;
+
static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
static void wait_for_devices(struct xenbus_driver *xendrv);
return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
}
-static int xenbus_uevent(struct device *_dev, struct kobj_uevent_env *env)
-{
- struct xenbus_device *dev = to_xenbus_device(_dev);
-
- if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
- return -ENOMEM;
-
- return 0;
-}
-
/* device/<type>/<id> => <type>-<id> */
static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
{
return read_otherend_details(xendev, "backend-id", "backend");
}
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+static int xenbus_uevent_frontend(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct xenbus_device *xdev;
+
+ if (dev == NULL)
+ return -ENODEV;
+ xdev = to_xenbus_device(dev);
+ if (xdev == NULL)
+ return -ENODEV;
+
+ /* stuff we want to pass to /sbin/hotplug */
+#if defined(CONFIG_XEN) || defined(MODULE)
+ add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype);
+ add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename);
+#endif
+ add_uevent_var(env, "MODALIAS=xen:%s", xdev->devicetype);
+
+ return 0;
+}
+#endif
/* Bus type for frontend drivers. */
static struct xen_bus_type xenbus_frontend = {
.levels = 2, /* device/type/<id> */
.get_bus_id = frontend_bus_id,
.probe = xenbus_probe_frontend,
+ .error = -ENODEV,
.bus = {
.name = "xen",
.match = xenbus_match,
- .uevent = xenbus_uevent,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
.probe = xenbus_dev_probe,
.remove = xenbus_dev_remove,
.shutdown = xenbus_dev_shutdown,
+ .uevent = xenbus_uevent_frontend,
+#endif
},
+#if defined(CONFIG_XEN) || defined(MODULE)
+ .dev = {
+ .bus_id = "xen",
+ },
+#endif
};
static void otherend_changed(struct xenbus_watch *watch,
if (!dev->otherend ||
strncmp(dev->otherend, vec[XS_WATCH_PATH],
strlen(dev->otherend))) {
- dev_dbg(&dev->dev, "Ignoring watch at %s\n",
- vec[XS_WATCH_PATH]);
+ dev_dbg(&dev->dev, "Ignoring watch at %s", vec[XS_WATCH_PATH]);
return;
}
state = xenbus_read_driver_state(dev->otherend);
- dev_dbg(&dev->dev, "state is %d, (%s), %s, %s\n",
+ dev_dbg(&dev->dev, "state is %d (%s), %s, %s",
state, xenbus_strstate(state), dev->otherend_watch.node,
vec[XS_WATCH_PATH]);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
/*
* Ignore xenbus transitions during shutdown. This prevents us doing
* work that can fail e.g., when the rootfs is gone.
xenbus_frontend_closed(dev);
return;
}
+#endif
if (drv->otherend_changed)
drv->otherend_changed(dev, state);
static int watch_otherend(struct xenbus_device *dev)
{
+#if defined(CONFIG_XEN) || defined(MODULE)
+ return xenbus_watch_path2(dev, dev->otherend, "state",
+ &dev->otherend_watch, otherend_changed);
+#else
return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed,
"%s/%s", dev->otherend, "state");
+#endif
}
err = talk_to_otherend(dev);
if (err) {
- dev_warn(&dev->dev, "talk_to_otherend on %s failed.\n",
- dev->nodename);
+ dev_warn(&dev->dev,
+ "xenbus_probe: talk_to_otherend on %s failed.\n",
+ dev->nodename);
return err;
}
err = watch_otherend(dev);
if (err) {
- dev_warn(&dev->dev, "watch_otherend on %s failed.\n",
- dev->nodename);
+ dev_warn(&dev->dev,
+ "xenbus_probe: watch_otherend on %s failed.\n",
+ dev->nodename);
return err;
}
DPRINTK("%s", dev->nodename);
+ if (is_initial_xendomain())
+ return;
+
get_device(&dev->dev);
if (dev->state != XenbusStateConnected) {
- printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__,
- dev->nodename, xenbus_strstate(dev->state));
+ dev_info(&dev->dev, "%s: %s: %s != Connected, skipping\n", __FUNCTION__,
+ dev->nodename, xenbus_strstate(dev->state));
goto out;
}
xenbus_switch_state(dev, XenbusStateClosing);
timeout = wait_for_completion_timeout(&dev->down, timeout);
if (!timeout)
- printk(KERN_INFO "%s: %s timeout closing device\n",
- __func__, dev->nodename);
+ dev_info(&dev->dev, "%s: %s timeout closing device\n",
+ __FUNCTION__, dev->nodename);
out:
put_device(&dev->dev);
}
struct module *owner,
const char *mod_name)
{
+ int ret;
+
+ if (bus->error)
+ return bus->error;
+
drv->driver.name = drv->name;
drv->driver.bus = &bus->bus;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
drv->driver.owner = owner;
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,21)
drv->driver.mod_name = mod_name;
+#endif
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
+ drv->driver.probe = xenbus_dev_probe;
+ drv->driver.remove = xenbus_dev_remove;
+ drv->driver.shutdown = xenbus_dev_shutdown;
+#endif
- return driver_register(&drv->driver);
+ mutex_lock(&xenwatch_mutex);
+ ret = driver_register(&drv->driver);
+ mutex_unlock(&xenwatch_mutex);
+ return ret;
}
int __xenbus_register_frontend(struct xenbus_driver *drv,
}
static ssize_t xendev_show_nodename(struct device *dev,
- struct device_attribute *attr, char *buf)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
+ struct device_attribute *attr,
+#endif
+ char *buf)
{
return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
}
DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
static ssize_t xendev_show_devtype(struct device *dev,
- struct device_attribute *attr, char *buf)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
+ struct device_attribute *attr,
+#endif
+ char *buf)
{
return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
}
DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
static ssize_t xendev_show_modalias(struct device *dev,
- struct device_attribute *attr, char *buf)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
+ struct device_attribute *attr,
+#endif
+ char *buf)
{
return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
}
enum xenbus_state state = xenbus_read_driver_state(nodename);
+ if (bus->error)
+ return bus->error;
+
if (state != XenbusStateInitialising) {
/* Device is not new, so ignore it. This can happen if a
device is going away after switching to Closed. */
xendev->devicetype = tmpstring;
init_completion(&xendev->down);
+#if defined(CONFIG_XEN) || defined(MODULE)
+ xendev->dev.parent = &bus->dev;
+#endif
xendev->dev.bus = &bus->bus;
xendev->dev.release = xenbus_dev_release;
char **dir;
unsigned int i, dir_n;
+ if (bus->error)
+ return bus->error;
+
dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n);
if (IS_ERR(dir))
return PTR_ERR(dir);
char type[BUS_ID_SIZE];
const char *p, *root;
- if (char_count(node, '/') < 2)
+ if (bus->error || char_count(node, '/') < 2)
return;
exists = xenbus_exists(XBT_NIL, node, "");
{
DPRINTK("");
- bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
+ if (!xenbus_frontend.error)
+ bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
xenbus_backend_suspend(suspend_dev);
xs_suspend();
}
{
xb_init_comms();
xs_resume();
- bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
+ if (!xenbus_frontend.error)
+ bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
xenbus_backend_resume(resume_dev);
}
EXPORT_SYMBOL_GPL(xenbus_resume);
void xenbus_suspend_cancel(void)
{
xs_suspend_cancel();
- bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev);
+ if (!xenbus_frontend.error)
+ bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev);
xenbus_backend_resume(suspend_cancel_dev);
}
EXPORT_SYMBOL_GPL(xenbus_suspend_cancel);
}
EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
+
void xenbus_probe(struct work_struct *unused)
{
BUG_ON((xenstored_ready <= 0));
blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
}
+
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST)
+static struct file_operations xsd_kva_fops;
+static struct proc_dir_entry *xsd_kva_intf;
+static struct proc_dir_entry *xsd_port_intf;
+
+static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ size_t size = vma->vm_end - vma->vm_start;
+
+ if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
+ return -EINVAL;
+
+ if (remap_pfn_range(vma, vma->vm_start, mfn_to_pfn(xen_store_mfn),
+ size, vma->vm_page_prot))
+ return -EAGAIN;
+
+ return 0;
+}
+
+static int xsd_kva_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ int len;
+
+ len = sprintf(page, "0x%p", xen_store_interface);
+ *eof = 1;
+ return len;
+}
+
+static int xsd_port_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ int len;
+
+ len = sprintf(page, "%d", xen_store_evtchn);
+ *eof = 1;
+ return len;
+}
+#endif
+
+#ifndef MODULE
static int __init xenbus_probe_init(void)
+#else
+static int __devinit xenbus_probe_init(void)
+#endif
{
int err = 0;
+#if defined(CONFIG_XEN) || defined(MODULE)
+ unsigned long page = 0;
+#endif
DPRINTK("");
- err = -ENODEV;
if (!is_running_on_xen())
- goto out_error;
+ return -ENODEV;
/* Register ourselves with the kernel bus subsystem */
- err = bus_register(&xenbus_frontend.bus);
- if (err)
- goto out_error;
-
- err = xenbus_backend_bus_register();
- if (err)
- goto out_unreg_front;
+ xenbus_frontend.error = bus_register(&xenbus_frontend.bus);
+ if (xenbus_frontend.error)
+ printk(KERN_WARNING
+ "XENBUS: Error registering frontend bus: %i\n",
+ xenbus_frontend.error);
+ xenbus_backend_bus_register();
/*
* Domain0 doesn't have a store_evtchn or store_mfn yet.
*/
if (is_initial_xendomain()) {
+#if defined(CONFIG_XEN) || defined(MODULE)
+ struct evtchn_alloc_unbound alloc_unbound;
+
+ /* Allocate page. */
+ page = get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ xen_store_mfn = xen_start_info->store_mfn =
+ pfn_to_mfn(virt_to_phys((void *)page) >>
+ PAGE_SHIFT);
+
+ /* Next allocate a local port which xenstored can bind to */
+ alloc_unbound.dom = DOMID_SELF;
+ alloc_unbound.remote_dom = 0;
+
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+ &alloc_unbound);
+ if (err == -ENOSYS)
+ goto err;
+ BUG_ON(err);
+ xen_store_evtchn = xen_start_info->store_evtchn =
+ alloc_unbound.port;
+
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST)
+ /* And finally publish the above info in /proc/xen */
+ xsd_kva_intf = create_xen_proc_entry("xsd_kva", 0600);
+ if (xsd_kva_intf) {
+ memcpy(&xsd_kva_fops, xsd_kva_intf->proc_fops,
+ sizeof(xsd_kva_fops));
+ xsd_kva_fops.mmap = xsd_kva_mmap;
+ xsd_kva_intf->proc_fops = &xsd_kva_fops;
+ xsd_kva_intf->read_proc = xsd_kva_read;
+ }
+ xsd_port_intf = create_xen_proc_entry("xsd_port", 0400);
+ if (xsd_port_intf)
+ xsd_port_intf->read_proc = xsd_port_read;
+#endif
+#else
/* dom0 not yet supported */
+#endif
+ xen_store_interface = mfn_to_virt(xen_store_mfn);
} else {
xenstored_ready = 1;
+#ifndef MODULE
xen_store_evtchn = xen_start_info->store_evtchn;
xen_store_mfn = xen_start_info->store_mfn;
+ xen_store_interface = mfn_to_virt(xen_store_mfn);
+#else
+ xen_store_evtchn = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN);
+ xen_store_mfn = hvm_get_parameter(HVM_PARAM_STORE_PFN);
+ xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT,
+ PAGE_SIZE);
+#endif
}
- xen_store_interface = mfn_to_virt(xen_store_mfn);
+
+
+#if defined(CONFIG_XEN) || defined(MODULE)
+ xenbus_dev_init();
+#endif
/* Initialize the interface to xenstore. */
err = xs_init();
if (err) {
printk(KERN_WARNING
"XENBUS: Error initializing xenstore comms: %i\n", err);
- goto out_unreg_back;
+ goto err;
}
+#if defined(CONFIG_XEN) || defined(MODULE)
+ /* Register ourselves with the kernel device subsystem */
+ if (!xenbus_frontend.error) {
+ xenbus_frontend.error = device_register(&xenbus_frontend.dev);
+ if (xenbus_frontend.error) {
+ bus_unregister(&xenbus_frontend.bus);
+ printk(KERN_WARNING
+ "XENBUS: Error registering frontend device: %i\n",
+ xenbus_frontend.error);
+ }
+ }
+#endif
+ xenbus_backend_device_register();
+
if (!is_initial_xendomain())
xenbus_probe(NULL);
return 0;
- out_unreg_back:
- xenbus_backend_bus_unregister();
+ err:
+#if defined(CONFIG_XEN) || defined(MODULE)
+ if (page)
+ free_page(page);
+#endif
- out_unreg_front:
- bus_unregister(&xenbus_frontend.bus);
+ /*
+ * Do not unregister the xenbus front/backend buses here. The buses
+ * must exist because front/backend drivers will use them when they are
+ * registered.
+ */
- out_error:
return err;
}
+#ifndef MODULE
postcore_initcall(xenbus_probe_init);
-
+#ifdef CONFIG_XEN
+MODULE_LICENSE("Dual BSD/GPL");
+#else
MODULE_LICENSE("GPL");
+#endif
+#else
+int __devinit xenbus_init(void)
+{
+ return xenbus_probe_init();
+}
+#endif
static int is_disconnected_device(struct device *dev, void *data)
{
return 0;
xendrv = to_xenbus_driver(dev->driver);
- return (xendev->state != XenbusStateConnected ||
+ return (xendev->state < XenbusStateConnected ||
(xendrv->is_ready && !xendrv->is_ready(xendev)));
}
static int exists_disconnected_device(struct device_driver *drv)
{
+ if (xenbus_frontend.error)
+ return xenbus_frontend.error;
return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
is_disconnected_device);
}
{
struct xenbus_device *xendev = to_xenbus_device(dev);
struct device_driver *drv = data;
+ struct xenbus_driver *xendrv;
/* Is this operation limited to a particular driver? */
if (drv && (dev->driver != drv))
/* Information only: is this too noisy? */
printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
xendev->nodename);
- } else if (xendev->state != XenbusStateConnected) {
+ return 0;
+ }
+
+ if (xendev->state < XenbusStateConnected) {
+ enum xenbus_state rstate = XenbusStateUnknown;
+ if (xendev->otherend)
+ rstate = xenbus_read_driver_state(xendev->otherend);
printk(KERN_WARNING "XENBUS: Timeout connecting "
- "to device: %s (state %d)\n",
- xendev->nodename, xendev->state);
+ "to device: %s (local state %d, remote state %d)\n",
+ xendev->nodename, xendev->state, rstate);
}
+ xendrv = to_xenbus_driver(dev->driver);
+ if (xendrv->is_ready && !xendrv->is_ready(xendev))
+ printk(KERN_WARNING "XENBUS: Device not ready: %s\n",
+ xendev->nodename);
+
return 0;
}
static int ready_to_wait_for_devices;
/*
- * On a 10 second timeout, wait for all devices currently configured. We need
+ * On a 5-minute timeout, wait for all devices currently configured. We need
* to do this to guarantee that the filesystems and / or network devices
* needed for boot are available, before we can allow the boot to proceed.
*
*/
static void wait_for_devices(struct xenbus_driver *xendrv)
{
- unsigned long timeout = jiffies + 10*HZ;
+ unsigned long start = jiffies;
struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
+ unsigned int seconds_waited = 0;
if (!ready_to_wait_for_devices || !is_running_on_xen())
return;
while (exists_disconnected_device(drv)) {
- if (time_after(jiffies, timeout))
- break;
+ if (time_after(jiffies, start + (seconds_waited+5)*HZ)) {
+ if (!seconds_waited)
+ printk(KERN_WARNING "XENBUS: Waiting for "
+ "devices to initialise: ");
+ seconds_waited += 5;
+ printk("%us...", 300 - seconds_waited);
+ if (seconds_waited == 300)
+ break;
+ }
+
schedule_timeout_interruptible(HZ/10);
}
+ if (seconds_waited)
+ printk("\n");
+
bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
print_device_status);
}
#ifndef MODULE
static int __init boot_wait_for_devices(void)
{
- ready_to_wait_for_devices = 1;
- wait_for_devices(NULL);
+ if (!xenbus_frontend.error) {
+ ready_to_wait_for_devices = 1;
+ wait_for_devices(NULL);
+ }
return 0;
}
late_initcall(boot_wait_for_devices);
#endif
+
+int xenbus_for_each_frontend(void *arg, int (*fn)(struct device *, void *))
+{
+ return bus_for_each_dev(&xenbus_frontend.bus, NULL, arg, fn);
+}
+EXPORT_SYMBOL_GPL(xenbus_for_each_frontend);
#ifndef _XENBUS_PROBE_H
#define _XENBUS_PROBE_H
-#ifdef CONFIG_XEN_BACKEND
+#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
extern void xenbus_backend_probe_and_watch(void);
-extern int xenbus_backend_bus_register(void);
-extern void xenbus_backend_bus_unregister(void);
+extern void xenbus_backend_bus_register(void);
+extern void xenbus_backend_device_register(void);
#else
static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
static inline void xenbus_backend_probe_and_watch(void) {}
-static inline int xenbus_backend_bus_register(void) { return 0; }
-static inline void xenbus_backend_bus_unregister(void) {}
+static inline void xenbus_backend_bus_register(void) {}
+static inline void xenbus_backend_device_register(void) {}
#endif
struct xen_bus_type
{
char *root;
+ int error;
unsigned int levels;
int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename);
int (*probe)(const char *type, const char *dir);
struct bus_type bus;
+#if defined(CONFIG_XEN) || defined(MODULE)
+ struct device dev;
+#endif
};
extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);
#endif
+
--- /dev/null
+/******************************************************************************
+ * Talks to Xen Store to figure out what devices we have (backend half).
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005, 2006 XenSource Ltd
+ * Copyright (C) 2007 Solarflare Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define DPRINTK(fmt, args...) \
+ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
+ __FUNCTION__, __LINE__, ##args)
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+
+#include <asm/io.h>
+#include <asm/page.h>
+#include <asm/maddr.h>
+#include <asm/pgtable.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/xen_proc.h>
+#include <xen/evtchn.h>
+#include <xen/features.h>
+
+#include "xenbus_comms.h"
+#include "xenbus_probe.h"
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+static int xenbus_uevent_backend(struct device *dev, struct kobj_uevent_env *env);
+static int xenbus_probe_backend(const char *type, const char *domid);
+
+extern int read_otherend_details(struct xenbus_device *xendev,
+ char *id_node, char *path_node);
+
+static int read_frontend_details(struct xenbus_device *xendev)
+{
+ return read_otherend_details(xendev, "frontend-id", "frontend");
+}
+
+/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
+static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
+{
+ int domid, err;
+ const char *devid, *type, *frontend;
+ unsigned int typelen;
+
+ type = strchr(nodename, '/');
+ if (!type)
+ return -EINVAL;
+ type++;
+ typelen = strcspn(type, "/");
+ if (!typelen || type[typelen] != '/')
+ return -EINVAL;
+
+ devid = strrchr(nodename, '/') + 1;
+
+ err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid,
+ "frontend", NULL, &frontend,
+ NULL);
+ if (err)
+ return err;
+ if (strlen(frontend) == 0)
+ err = -ERANGE;
+ if (!err && !xenbus_exists(XBT_NIL, frontend, ""))
+ err = -ENOENT;
+ kfree(frontend);
+
+ if (err)
+ return err;
+
+ if (snprintf(bus_id, BUS_ID_SIZE,
+ "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE)
+ return -ENOSPC;
+ return 0;
+}
+
+static struct xen_bus_type xenbus_backend = {
+ .root = "backend",
+ .levels = 3, /* backend/type/<frontend>/<id> */
+ .get_bus_id = backend_bus_id,
+ .probe = xenbus_probe_backend,
+ .error = -ENODEV,
+ .bus = {
+ .name = "xen-backend",
+ .match = xenbus_match,
+ .probe = xenbus_dev_probe,
+ .remove = xenbus_dev_remove,
+// .shutdown = xenbus_dev_shutdown,
+ .uevent = xenbus_uevent_backend,
+ },
+ .dev = {
+ .bus_id = "xen-backend",
+ },
+};
+
+static int xenbus_uevent_backend(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct xenbus_device *xdev;
+ struct xenbus_driver *drv;
+
+ DPRINTK("");
+
+ if (dev == NULL)
+ return -ENODEV;
+
+ xdev = to_xenbus_device(dev);
+ if (xdev == NULL)
+ return -ENODEV;
+
+ /* stuff we want to pass to /sbin/hotplug */
+ add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype);
+
+ add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename);
+
+ add_uevent_var(env, "XENBUS_BASE_PATH=%s", xenbus_backend.root);
+
+ if (dev->driver) {
+ drv = to_xenbus_driver(dev->driver);
+ if (drv && drv->uevent)
+ return drv->uevent(xdev, env);
+ }
+
+ return 0;
+}
+
+int __xenbus_register_backend(struct xenbus_driver *drv,
+ struct module *owner, const char *mod_name)
+{
+ drv->read_otherend_details = read_frontend_details;
+
+ return xenbus_register_driver_common(drv, &xenbus_backend,
+ owner, mod_name);
+}
+EXPORT_SYMBOL_GPL(__xenbus_register_backend);
+
+/* backend/<typename>/<frontend-uuid>/<name> */
+static int xenbus_probe_backend_unit(const char *dir,
+ const char *type,
+ const char *name)
+{
+ char *nodename;
+ int err;
+
+ nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
+ if (!nodename)
+ return -ENOMEM;
+
+ DPRINTK("%s\n", nodename);
+
+ err = xenbus_probe_node(&xenbus_backend, type, nodename);
+ kfree(nodename);
+ return err;
+}
+
+/* backend/<typename>/<frontend-domid> */
+static int xenbus_probe_backend(const char *type, const char *domid)
+{
+ char *nodename;
+ int err = 0;
+ char **dir;
+ unsigned int i, dir_n = 0;
+
+ DPRINTK("");
+
+ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", xenbus_backend.root, type, domid);
+ if (!nodename)
+ return -ENOMEM;
+
+ dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n);
+ if (IS_ERR(dir)) {
+ kfree(nodename);
+ return PTR_ERR(dir);
+ }
+
+ for (i = 0; i < dir_n; i++) {
+ err = xenbus_probe_backend_unit(nodename, type, dir[i]);
+ if (err)
+ break;
+ }
+ kfree(dir);
+ kfree(nodename);
+ return err;
+}
+
+static void backend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ DPRINTK("");
+
+ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
+}
+
+static struct xenbus_watch be_watch = {
+ .node = "backend",
+ .callback = backend_changed,
+};
+
+void xenbus_backend_suspend(int (*fn)(struct device *, void *))
+{
+ DPRINTK("");
+ if (!xenbus_backend.error)
+ bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
+}
+
+void xenbus_backend_resume(int (*fn)(struct device *, void *))
+{
+ DPRINTK("");
+ if (!xenbus_backend.error)
+ bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
+}
+
+void xenbus_backend_probe_and_watch(void)
+{
+ xenbus_probe_devices(&xenbus_backend);
+ register_xenbus_watch(&be_watch);
+}
+
+void xenbus_backend_bus_register(void)
+{
+ xenbus_backend.error = bus_register(&xenbus_backend.bus);
+ if (xenbus_backend.error)
+ printk(KERN_WARNING
+ "XENBUS: Error registering backend bus: %i\n",
+ xenbus_backend.error);
+}
+
+void xenbus_backend_device_register(void)
+{
+ if (xenbus_backend.error)
+ return;
+
+ xenbus_backend.error = device_register(&xenbus_backend.dev);
+ if (xenbus_backend.error) {
+ bus_unregister(&xenbus_backend.bus);
+ printk(KERN_WARNING
+ "XENBUS: Error registering backend device: %i\n",
+ xenbus_backend.error);
+ }
+}
+
+int xenbus_for_each_backend(void *arg, int (*fn)(struct device *, void *))
+{
+ return bus_for_each_dev(&xenbus_backend.bus, NULL, arg, fn);
+}
+EXPORT_SYMBOL_GPL(xenbus_for_each_backend);
#include <xen/xenbus.h>
#include "xenbus_comms.h"
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#ifndef PF_NOFREEZE /* Old kernel (pre-2.6.6). */
+#define PF_NOFREEZE 0
+#endif
+
struct xs_stored_msg {
struct list_head list;
* carrying out work.
*/
static pid_t xenwatch_pid;
-static DEFINE_MUTEX(xenwatch_mutex);
+/* static */ DEFINE_MUTEX(xenwatch_mutex);
static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
static int get_error(const char *errorstring)
mutex_unlock(&xs_state.request_mutex);
- if ((msg->type == XS_TRANSACTION_END) ||
+ if ((req_msg.type == XS_TRANSACTION_END) ||
((req_msg.type == XS_TRANSACTION_START) &&
(msg->type == XS_ERROR)))
up_read(&xs_state.transaction_mutex);
char *buffer;
if (strlen(name) == 0)
- buffer = kasprintf(GFP_KERNEL, "%s", dir);
+ buffer = kasprintf(GFP_KERNEL|__GFP_HIGH, "%s", dir);
else
- buffer = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
+ buffer = kasprintf(GFP_KERNEL|__GFP_HIGH, "%s/%s", dir, name);
return (!buffer) ? ERR_PTR(-ENOMEM) : buffer;
}
*num = count_strings(strings, len);
/* Transfer to one big alloc for easy freeing. */
- ret = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL);
+ ret = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL|__GFP_HIGH);
if (!ret) {
kfree(strings);
return ERR_PTR(-ENOMEM);
#define PRINTF_BUFFER_SIZE 4096
char *printf_buffer;
- printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
+ printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL|__GFP_HIGH);
if (printf_buffer == NULL)
return -ENOMEM;
char token[sizeof(watch) * 2 + 1];
int err;
+#if defined(CONFIG_XEN) || defined(MODULE)
+ BUG_ON(watch->flags & XBWF_new_thread);
+#endif
+
sprintf(token, "%lX", (long)watch);
down_read(&xs_state.watch_mutex);
up_write(&xs_state.transaction_mutex);
}
+#if defined(CONFIG_XEN) || defined(MODULE)
+static int xenwatch_handle_callback(void *data)
+{
+ struct xs_stored_msg *msg = data;
+
+ msg->u.watch.handle->callback(msg->u.watch.handle,
+ (const char **)msg->u.watch.vec,
+ msg->u.watch.vec_size);
+
+ kfree(msg->u.watch.vec);
+ kfree(msg);
+
+ /* Kill this kthread if we were spawned just for this callback. */
+ if (current->pid != xenwatch_pid)
+ do_exit(0);
+
+ return 0;
+}
+#endif
+
static int xenwatch_thread(void *unused)
{
struct list_head *ent;
struct xs_stored_msg *msg;
+ current->flags |= PF_NOFREEZE;
for (;;) {
wait_event_interruptible(watch_events_waitq,
!list_empty(&watch_events));
list_del(ent);
spin_unlock(&watch_events_lock);
- if (ent != &watch_events) {
- msg = list_entry(ent, struct xs_stored_msg, list);
- msg->u.watch.handle->callback(
- msg->u.watch.handle,
- (const char **)msg->u.watch.vec,
- msg->u.watch.vec_size);
- kfree(msg->u.watch.vec);
- kfree(msg);
+ if (ent == &watch_events) {
+ mutex_unlock(&xenwatch_mutex);
+ continue;
}
+ msg = list_entry(ent, struct xs_stored_msg, list);
+
+#if defined(CONFIG_XEN) || defined(MODULE)
+ /*
+ * Unlock the mutex before running an XBWF_new_thread
+ * handler. kthread_run can block which can deadlock
+ * against unregister_xenbus_watch() if we need to
+ * unregister other watches in order to make
+ * progress. This can occur on resume before the swap
+ * device is attached.
+ */
+ if (msg->u.watch.handle->flags & XBWF_new_thread) {
+ mutex_unlock(&xenwatch_mutex);
+ kthread_run(xenwatch_handle_callback,
+ msg, "xenwatch_cb");
+ } else {
+ xenwatch_handle_callback(msg);
+ mutex_unlock(&xenwatch_mutex);
+ }
+#else
+ msg->u.watch.handle->callback(
+ msg->u.watch.handle,
+ (const char **)msg->u.watch.vec,
+ msg->u.watch.vec_size);
mutex_unlock(&xenwatch_mutex);
+ kfree(msg->u.watch.vec);
+ kfree(msg);
+#endif
}
return 0;
}
- msg = kmalloc(sizeof(*msg), GFP_KERNEL);
+ msg = kmalloc(sizeof(*msg), GFP_KERNEL|__GFP_HIGH);
if (msg == NULL) {
err = -ENOMEM;
goto out;
goto out;
}
- body = kmalloc(msg->hdr.len + 1, GFP_KERNEL);
+ body = kmalloc(msg->hdr.len + 1, GFP_KERNEL|__GFP_HIGH);
if (body == NULL) {
kfree(msg);
err = -ENOMEM;
{
int err;
+ current->flags |= PF_NOFREEZE;
for (;;) {
err = process_msg();
if (err)
--- /dev/null
+/**
+ * @file xenoprofile.c
+ *
+ * @remark Copyright 2002 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * @author John Levon <levon@movementarian.org>
+ *
+ * Modified by Aravind Menon and Jose Renato Santos for Xen
+ * These modifications are:
+ * Copyright (C) 2005 Hewlett-Packard Co.
+ *
+ * Separated out arch-generic part
+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan K.K.
+ */
+
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/smp.h>
+#include <linux/oprofile.h>
+#include <linux/sysdev.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/vmalloc.h>
+#include <asm/pgtable.h>
+#include <xen/evtchn.h>
+#include <xen/xenoprof.h>
+#include <xen/driver_util.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/xenoprof.h>
+#include "../../../drivers/oprofile/cpu_buffer.h"
+#include "../../../drivers/oprofile/event_buffer.h"
+
+#define MAX_XENOPROF_SAMPLES 16
+
+/* sample buffers shared with Xen */
+xenoprof_buf_t * xenoprof_buf[MAX_VIRT_CPUS];
+/* Shared buffer area */
+struct xenoprof_shared_buffer shared_buffer;
+
+/* Passive sample buffers shared with Xen */
+xenoprof_buf_t *p_xenoprof_buf[MAX_OPROF_DOMAINS][MAX_VIRT_CPUS];
+/* Passive shared buffer area */
+struct xenoprof_shared_buffer p_shared_buffer[MAX_OPROF_DOMAINS];
+
+static int xenoprof_start(void);
+static void xenoprof_stop(void);
+
+static int xenoprof_enabled = 0;
+static int xenoprof_is_primary = 0;
+static int active_defined;
+
+extern unsigned long backtrace_depth;
+
+/* Number of buffers in shared area (one per VCPU) */
+int nbuf;
+/* Mappings of VIRQ_XENOPROF to irq number (per cpu) */
+int ovf_irq[NR_CPUS];
+/* cpu model type string - copied from Xen on XENOPROF_init command */
+char cpu_type[XENOPROF_CPU_TYPE_SIZE];
+
+#ifdef CONFIG_PM
+
+static int xenoprof_suspend(struct sys_device * dev, pm_message_t state)
+{
+ if (xenoprof_enabled == 1)
+ xenoprof_stop();
+ return 0;
+}
+
+
+static int xenoprof_resume(struct sys_device * dev)
+{
+ if (xenoprof_enabled == 1)
+ xenoprof_start();
+ return 0;
+}
+
+
+static struct sysdev_class oprofile_sysclass = {
+ .name = "oprofile",
+ .resume = xenoprof_resume,
+ .suspend = xenoprof_suspend
+};
+
+
+static struct sys_device device_oprofile = {
+ .id = 0,
+ .cls = &oprofile_sysclass,
+};
+
+
+static int __init init_driverfs(void)
+{
+ int error;
+ if (!(error = sysdev_class_register(&oprofile_sysclass)))
+ error = sysdev_register(&device_oprofile);
+ return error;
+}
+
+
+static void exit_driverfs(void)
+{
+ sysdev_unregister(&device_oprofile);
+ sysdev_class_unregister(&oprofile_sysclass);
+}
+
+#else
+#define init_driverfs() do { } while (0)
+#define exit_driverfs() do { } while (0)
+#endif /* CONFIG_PM */
+
+unsigned long long oprofile_samples = 0;
+unsigned long long p_oprofile_samples = 0;
+
+unsigned int pdomains;
+struct xenoprof_passive passive_domains[MAX_OPROF_DOMAINS];
+
+/* Check whether the given entry is an escape code */
+static int xenoprof_is_escape(xenoprof_buf_t * buf, int tail)
+{
+ return (buf->event_log[tail].eip == XENOPROF_ESCAPE_CODE);
+}
+
+/* Get the event at the given entry */
+static uint8_t xenoprof_get_event(xenoprof_buf_t * buf, int tail)
+{
+ return (buf->event_log[tail].event);
+}
+
+static void xenoprof_add_pc(xenoprof_buf_t *buf, int is_passive)
+{
+ int head, tail, size;
+ int tracing = 0;
+
+ head = buf->event_head;
+ tail = buf->event_tail;
+ size = buf->event_size;
+
+ while (tail != head) {
+ if (xenoprof_is_escape(buf, tail) &&
+ xenoprof_get_event(buf, tail) == XENOPROF_TRACE_BEGIN) {
+ tracing=1;
+ oprofile_add_pc(ESCAPE_CODE, buf->event_log[tail].mode,
+ CPU_TRACE_BEGIN);
+ if (!is_passive)
+ oprofile_samples++;
+ else
+ p_oprofile_samples++;
+
+ } else {
+ oprofile_add_pc(buf->event_log[tail].eip,
+ buf->event_log[tail].mode,
+ buf->event_log[tail].event);
+ if (!tracing) {
+ if (!is_passive)
+ oprofile_samples++;
+ else
+ p_oprofile_samples++;
+ }
+
+ }
+ tail++;
+ if(tail==size)
+ tail=0;
+ }
+ buf->event_tail = tail;
+}
+
+static void xenoprof_handle_passive(void)
+{
+ int i, j;
+ int flag_domain, flag_switch = 0;
+
+ for (i = 0; i < pdomains; i++) {
+ flag_domain = 0;
+ for (j = 0; j < passive_domains[i].nbuf; j++) {
+ xenoprof_buf_t *buf = p_xenoprof_buf[i][j];
+ if (buf->event_head == buf->event_tail)
+ continue;
+ if (!flag_domain) {
+ if (!oprofile_add_domain_switch(
+ passive_domains[i].domain_id))
+ goto done;
+ flag_domain = 1;
+ }
+ xenoprof_add_pc(buf, 1);
+ flag_switch = 1;
+ }
+ }
+done:
+ if (flag_switch)
+ oprofile_add_domain_switch(COORDINATOR_DOMAIN);
+}
+
+static irqreturn_t
+xenoprof_ovf_interrupt(int irq, void * dev_id)
+{
+ struct xenoprof_buf * buf;
+ static unsigned long flag;
+
+ buf = xenoprof_buf[smp_processor_id()];
+
+ xenoprof_add_pc(buf, 0);
+
+ if (xenoprof_is_primary && !test_and_set_bit(0, &flag)) {
+ xenoprof_handle_passive();
+ smp_mb__before_clear_bit();
+ clear_bit(0, &flag);
+ }
+
+ return IRQ_HANDLED;
+}
+
+
+static void unbind_virq(void)
+{
+ unsigned int i;
+
+ for_each_online_cpu(i) {
+ if (ovf_irq[i] >= 0) {
+ unbind_from_irqhandler(ovf_irq[i], NULL);
+ ovf_irq[i] = -1;
+ }
+ }
+}
+
+
+static int bind_virq(void)
+{
+ unsigned int i;
+ int result;
+
+ for_each_online_cpu(i) {
+ result = bind_virq_to_irqhandler(VIRQ_XENOPROF,
+ i,
+ xenoprof_ovf_interrupt,
+ IRQF_DISABLED|IRQF_NOBALANCING,
+ "xenoprof",
+ NULL);
+
+ if (result < 0) {
+ unbind_virq();
+ return result;
+ }
+
+ ovf_irq[i] = result;
+ }
+
+ return 0;
+}
+
+
+static void unmap_passive_list(void)
+{
+ int i;
+ for (i = 0; i < pdomains; i++)
+ xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[i]);
+ pdomains = 0;
+}
+
+
+static int map_xenoprof_buffer(int max_samples)
+{
+ struct xenoprof_get_buffer get_buffer;
+ struct xenoprof_buf *buf;
+ int ret, i;
+
+ if ( shared_buffer.buffer )
+ return 0;
+
+ get_buffer.max_samples = max_samples;
+ ret = xenoprof_arch_map_shared_buffer(&get_buffer, &shared_buffer);
+ if (ret)
+ return ret;
+ nbuf = get_buffer.nbuf;
+
+ for (i=0; i< nbuf; i++) {
+ buf = (struct xenoprof_buf*)
+ &shared_buffer.buffer[i * get_buffer.bufsize];
+ BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS);
+ xenoprof_buf[buf->vcpu_id] = buf;
+ }
+
+ return 0;
+}
+
+
+static int xenoprof_setup(void)
+{
+ int ret;
+
+ if ( (ret = map_xenoprof_buffer(MAX_XENOPROF_SAMPLES)) )
+ return ret;
+
+ if ( (ret = bind_virq()) )
+ return ret;
+
+ if (xenoprof_is_primary) {
+ /* Define dom0 as an active domain if not done yet */
+ if (!active_defined) {
+ domid_t domid;
+ ret = HYPERVISOR_xenoprof_op(
+ XENOPROF_reset_active_list, NULL);
+ if (ret)
+ goto err;
+ domid = 0;
+ ret = HYPERVISOR_xenoprof_op(
+ XENOPROF_set_active, &domid);
+ if (ret)
+ goto err;
+ active_defined = 1;
+ }
+
+ if (backtrace_depth > 0) {
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_set_backtrace,
+ &backtrace_depth);
+ if (ret)
+ backtrace_depth = 0;
+ }
+
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL);
+ if (ret)
+ goto err;
+
+ xenoprof_arch_counter();
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_setup_events, NULL);
+ if (ret)
+ goto err;
+ }
+
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_enable_virq, NULL);
+ if (ret)
+ goto err;
+
+ xenoprof_enabled = 1;
+ return 0;
+ err:
+ unbind_virq();
+ return ret;
+}
+
+
+static void xenoprof_shutdown(void)
+{
+ xenoprof_enabled = 0;
+
+ WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_disable_virq, NULL));
+
+ if (xenoprof_is_primary) {
+ WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_release_counters,
+ NULL));
+ active_defined = 0;
+ }
+
+ unbind_virq();
+
+ xenoprof_arch_unmap_shared_buffer(&shared_buffer);
+ if (xenoprof_is_primary)
+ unmap_passive_list();
+}
+
+
+static int xenoprof_start(void)
+{
+ int ret = 0;
+
+ if (xenoprof_is_primary)
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_start, NULL);
+ if (!ret)
+ xenoprof_arch_start();
+ return ret;
+}
+
+
+static void xenoprof_stop(void)
+{
+ if (xenoprof_is_primary)
+ WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_stop, NULL));
+ xenoprof_arch_stop();
+}
+
+
+static int xenoprof_set_active(int * active_domains,
+ unsigned int adomains)
+{
+ int ret = 0;
+ int i;
+ int set_dom0 = 0;
+ domid_t domid;
+
+ if (!xenoprof_is_primary)
+ return 0;
+
+ if (adomains > MAX_OPROF_DOMAINS)
+ return -E2BIG;
+
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
+ if (ret)
+ return ret;
+
+ for (i=0; i<adomains; i++) {
+ domid = active_domains[i];
+ if (domid != active_domains[i]) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
+ if (ret)
+ goto out;
+ if (active_domains[i] == 0)
+ set_dom0 = 1;
+ }
+ /* dom0 must always be active but may not be in the list */
+ if (!set_dom0) {
+ domid = 0;
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
+ }
+
+out:
+ if (ret)
+ WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list,
+ NULL));
+ active_defined = !ret;
+ return ret;
+}
+
+static int xenoprof_set_passive(int * p_domains,
+ unsigned int pdoms)
+{
+ int ret;
+ unsigned int i, j;
+ struct xenoprof_buf *buf;
+
+ if (!xenoprof_is_primary)
+ return 0;
+
+ if (pdoms > MAX_OPROF_DOMAINS)
+ return -E2BIG;
+
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_passive_list, NULL);
+ if (ret)
+ return ret;
+ unmap_passive_list();
+
+ for (i = 0; i < pdoms; i++) {
+ passive_domains[i].domain_id = p_domains[i];
+ passive_domains[i].max_samples = 2048;
+ ret = xenoprof_arch_set_passive(&passive_domains[i],
+ &p_shared_buffer[i]);
+ if (ret)
+ goto out;
+ for (j = 0; j < passive_domains[i].nbuf; j++) {
+ buf = (struct xenoprof_buf *)
+ &p_shared_buffer[i].buffer[
+ j * passive_domains[i].bufsize];
+ BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS);
+ p_xenoprof_buf[i][buf->vcpu_id] = buf;
+ }
+ }
+
+ pdomains = pdoms;
+ return 0;
+
+out:
+ for (j = 0; j < i; j++)
+ xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[i]);
+
+ return ret;
+}
+
+
+/* The dummy backtrace function to keep oprofile happy
+ * The real backtrace is done in xen
+ */
+static void xenoprof_dummy_backtrace(struct pt_regs * const regs,
+ unsigned int depth)
+{
+ /* this should never be called */
+ BUG();
+ return;
+}
+
+
+
+struct oprofile_operations xenoprof_ops = {
+#ifdef HAVE_XENOPROF_CREATE_FILES
+ .create_files = xenoprof_create_files,
+#endif
+ .set_active = xenoprof_set_active,
+ .set_passive = xenoprof_set_passive,
+ .setup = xenoprof_setup,
+ .shutdown = xenoprof_shutdown,
+ .start = xenoprof_start,
+ .stop = xenoprof_stop,
+ .backtrace = xenoprof_dummy_backtrace
+};
+
+
+/* in order to get driverfs right */
+static int using_xenoprof;
+
+int __init xenoprofile_init(struct oprofile_operations * ops)
+{
+ struct xenoprof_init init;
+ unsigned int i;
+ int ret;
+
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init);
+ if (!ret) {
+ xenoprof_arch_init_counter(&init);
+ xenoprof_is_primary = init.is_primary;
+
+ /* cpu_type is detected by Xen */
+ cpu_type[XENOPROF_CPU_TYPE_SIZE-1] = 0;
+ strncpy(cpu_type, init.cpu_type, XENOPROF_CPU_TYPE_SIZE - 1);
+ xenoprof_ops.cpu_type = cpu_type;
+
+ init_driverfs();
+ using_xenoprof = 1;
+ *ops = xenoprof_ops;
+
+ for (i=0; i<NR_CPUS; i++)
+ ovf_irq[i] = -1;
+
+ active_defined = 0;
+ }
+
+ printk(KERN_INFO "%s: ret %d, events %d, xenoprof_is_primary %d\n",
+ __func__, ret, init.num_events, xenoprof_is_primary);
+ return ret;
+}
+
+
+void xenoprofile_exit(void)
+{
+ if (using_xenoprof)
+ exit_driverfs();
+
+ xenoprof_arch_unmap_shared_buffer(&shared_buffer);
+ if (xenoprof_is_primary) {
+ unmap_passive_list();
+ WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_shutdown, NULL));
+ }
+}
bool "HugeTLB file system support"
depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \
(S390 && 64BIT) || BROKEN
+ depends on !XEN
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
+#ifdef CONFIG_EPOLL
+#include <linux/poll.h>
+#include <linux/anon_inodes.h>
+#endif
+
#if DEBUG > 1
#define dprintk printk
#else
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
+#ifdef CONFIG_EPOLL
+ if (ctx->file && waitqueue_active(&ctx->poll_wait))
+ wake_up(&ctx->poll_wait);
+#endif
+
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
return ret;
}
/* aio_read_evt
* Pull an event off of the ioctx's event ring. Returns the number of
* events fetched (0 or 1 ;-)
+ * If ent parameter is 0, just returns the number of events that would
+ * be fetched.
* FIXME: make this use cmpxchg.
* TODO: make the ringbuffer user mmap()able (requires FIXME).
*/
head = ring->head % info->nr;
if (head != ring->tail) {
- struct io_event *evp = aio_ring_event(info, head, KM_USER1);
- *ent = *evp;
- head = (head + 1) % info->nr;
- smp_mb(); /* finish reading the event before updatng the head */
- ring->head = head;
- ret = 1;
- put_aio_ring_event(evp, KM_USER1);
+ if (ent) { /* event requested */
+ struct io_event *evp =
+ aio_ring_event(info, head, KM_USER1);
+ *ent = *evp;
+ head = (head + 1) % info->nr;
+ /* finish reading the event before updatng the head */
+ smp_mb();
+ ring->head = head;
+ ret = 1;
+ put_aio_ring_event(evp, KM_USER1);
+ } else /* only need to know availability */
+ ret = 1;
}
spin_unlock(&info->ring_lock);
aio_cancel_all(ioctx);
wait_for_all_aios(ioctx);
+#ifdef CONFIG_EPOLL
+ /* forget the poll file, but it's up to the user to close it */
+ if (ioctx->file) {
+ fput(ioctx->file);
+ ioctx->file->private_data = 0;
+ ioctx->file = 0;
+ }
+#endif
/*
* Wake up any waiters. The setting of ctx->dead must be seen
put_ioctx(ioctx); /* once for the lookup */
}
+#ifdef CONFIG_EPOLL
+
+static int aio_queue_fd_close(struct inode *inode, struct file *file)
+{
+ struct kioctx *ioctx = file->private_data;
+ if (ioctx) {
+ file->private_data = 0;
+ spin_lock_irq(&ioctx->ctx_lock);
+ ioctx->file = 0;
+ spin_unlock_irq(&ioctx->ctx_lock);
+ fput(file);
+ }
+ return 0;
+}
+
+static unsigned int aio_queue_fd_poll(struct file *file, poll_table *wait)
+{ unsigned int pollflags = 0;
+ struct kioctx *ioctx = file->private_data;
+
+ if (ioctx) {
+
+ spin_lock_irq(&ioctx->ctx_lock);
+ /* Insert inside our poll wait queue */
+ poll_wait(file, &ioctx->poll_wait, wait);
+
+ /* Check our condition */
+ if (aio_read_evt(ioctx, 0))
+ pollflags = POLLIN | POLLRDNORM;
+ spin_unlock_irq(&ioctx->ctx_lock);
+ }
+
+ return pollflags;
+}
+
+static const struct file_operations aioq_fops = {
+ .release = aio_queue_fd_close,
+ .poll = aio_queue_fd_poll
+};
+
+/* make_aio_fd:
+ * Create a file descriptor that can be used to poll the event queue.
+ * Based on the excellent epoll code.
+ */
+
+static int make_aio_fd(struct kioctx *ioctx)
+{
+ int fd;
+ struct file *file;
+
+ fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
+ if (fd < 0)
+ return fd;
+
+ /* associate the file with the IO context */
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+ file->private_data = ioctx;
+ ioctx->file = file;
+ init_waitqueue_head(&ioctx->poll_wait);
+ return fd;
+}
+#endif
+
/* sys_io_setup:
* Create an aio_context capable of receiving at least nr_events.
* ctxp must not point to an aio_context that already exists, and
* resources are available. May fail with -EFAULT if an invalid
* pointer is passed for ctxp. Will fail with -ENOSYS if not
* implemented.
+ *
+ * To request a selectable fd, the user context has to be initialized
+ * to 1, instead of 0, and the return value is the fd.
+ * This keeps the system call compatible, since a non-zero value
+ * was not allowed so far.
*/
asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp)
{
struct kioctx *ioctx = NULL;
unsigned long ctx;
long ret;
+ int make_fd = 0;
ret = get_user(ctx, ctxp);
if (unlikely(ret))
goto out;
ret = -EINVAL;
+#ifdef CONFIG_EPOLL
+ if (ctx == 1) {
+ make_fd = 1;
+ ctx = 0;
+ }
+#endif
if (unlikely(ctx || nr_events == 0)) {
pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
ctx, nr_events);
ret = PTR_ERR(ioctx);
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
- if (!ret)
- return 0;
+#ifdef CONFIG_EPOLL
+ if (make_fd && ret >= 0)
+ ret = make_aio_fd(ioctx);
+#endif
+ if (ret >= 0)
+ return ret;
get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */
io_destroy(ioctx);
#include <asm/fbio.h>
#endif
+#ifdef CONFIG_XEN
+#include <xen/interface/xen.h>
+#include <xen/public/evtchn.h>
+#include <xen/public/privcmd.h>
+#include <xen/compat_ioctl.h>
+#endif
+
static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd,
unsigned long arg, struct file *f)
{
IGNORE_IOCTL(FBIOSCURSOR32)
IGNORE_IOCTL(FBIOGCURSOR32)
#endif
+
+#ifdef CONFIG_XEN
+HANDLE_IOCTL(IOCTL_PRIVCMD_MMAP_32, privcmd_ioctl_32)
+HANDLE_IOCTL(IOCTL_PRIVCMD_MMAPBATCH_32, privcmd_ioctl_32)
+COMPATIBLE_IOCTL(IOCTL_PRIVCMD_HYPERCALL)
+COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_VIRQ)
+COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_INTERDOMAIN)
+COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_UNBOUND_PORT)
+COMPATIBLE_IOCTL(IOCTL_EVTCHN_UNBIND)
+COMPATIBLE_IOCTL(IOCTL_EVTCHN_NOTIFY)
+COMPATIBLE_IOCTL(IOCTL_EVTCHN_RESET)
+#endif
};
#define IOCTL_HASHSIZE 256
if (!access_ok(VERIFY_READ, base, len))
break;
+ if (unlikely(!access_ok(VERIFY_READ, base, len)))
+ break;
+
/*
* Get this base offset and number of pages, then map
* in the user pages.
{
a_list_t *aentry;
-#ifdef CONFIG_XEN
+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
/*
* Xen needs to be able to make sure it can get an exclusive
* RO mapping of pages it wants to turn into a pagetable. If
return root;
}
+#ifndef pcibios_scan_all_fns
#define pcibios_scan_all_fns(a, b) 0
+#endif
#ifndef HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ
static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
}
#endif
+#ifndef arch_change_pte_range
+#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
+#endif
+
#ifndef __HAVE_ARCH_PTE_SAME
#define pte_same(A,B) (pte_val(A) == pte_val(B))
#endif
#include <asm/processor.h>
#include <asm/mmu.h>
+#ifdef CONFIG_XEN
+#include <xen/interface/platform.h>
+#endif
+
#define COMPILER_DEPENDENT_INT64 long long
#define COMPILER_DEPENDENT_UINT64 unsigned long long
/* early initialization routine */
extern void acpi_reserve_bootmem(void);
+#ifdef CONFIG_XEN
+static inline int acpi_notify_hypervisor_state(u8 sleep_state,
+ u32 pm1a_cnt_val,
+ u32 pm1b_cnt_val)
+{
+ struct xen_platform_op op = {
+ .cmd = XENPF_enter_acpi_sleep,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u = {
+ .enter_acpi_sleep = {
+ .pm1a_cnt_val = pm1a_cnt_val,
+ .pm1b_cnt_val = pm1b_cnt_val,
+ .sleep_state = sleep_state,
+ },
+ },
+ };
+
+ return HYPERVISOR_platform_op(&op);
+}
+#endif /* CONFIG_XEN */
+
/*
* Check if the CPU can handle C2 and deeper
*/
#endif /* !CONFIG_ACPI */
+#ifndef CONFIG_XEN
#define ARCH_HAS_POWER_INIT 1
+#endif
struct bootnode;
#include <linux/pm.h>
#include <linux/delay.h>
+#ifndef CONFIG_XEN
#include <asm/fixmap.h>
+#endif
#include <asm/apicdef.h>
#include <asm/processor.h>
#include <asm/system.h>
+#ifndef CONFIG_XEN
#define ARCH_APICTIMER_STOPS_ON_C3 1
+#endif
#define Dprintk(x...)
#ifdef CONFIG_X86_LOCAL_APIC
extern int apic_verbosity;
+#ifndef CONFIG_XEN
extern int timer_over_8254;
extern int local_apic_timer_c2_ok;
extern int local_apic_timer_disabled;
/* Docs say use 0 for future compatibility */
apic_write_around(APIC_EOI, 0);
}
+#endif
extern int lapic_get_maxlvt(void);
extern void clear_local_APIC(void);
#ifndef _ASM_X86_APICDEF_H
#define _ASM_X86_APICDEF_H
+#ifndef CONFIG_XEN
+
/*
* Constants for various Intel APICs. (local APIC, IOAPIC, etc.)
*
#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
+#else /* CONFIG_XEN */
+
+enum {
+ APIC_DEST_ALLBUT = 0x1,
+ APIC_DEST_SELF,
+ APIC_DEST_ALLINC
+};
+
+#endif /* CONFIG_XEN */
+
#ifdef CONFIG_X86_32
# define MAX_IO_APICS 64
#else
# define MAX_LOCAL_APIC 32768
#endif
+#ifndef CONFIG_XEN
+
/*
* All x86-64 systems are xAPIC compatible.
* In the following, "apicid" is a physical APIC ID.
#undef u32
+#endif /* CONFIG_XEN */
+
#ifdef CONFIG_X86_32
#define BAD_APICID 0xFFu
#else
int direction)
{
BUG_ON(!valid_dma_direction(direction));
+#ifndef CONFIG_XEN
return dma_ops->map_single(dev, page_to_phys(page)+offset,
size, direction);
+#else
+ return dma_ops->map_single(dev, page_to_pseudophys(page)+offset,
+ size, direction);
+#endif
}
static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
extern struct genapic apic_physflat;
extern int acpi_madt_oem_check(char *, char *);
+#ifndef CONFIG_XEN
enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
extern enum uv_system_type get_uv_system_type(void);
extern int is_uv_system(void);
DECLARE_PER_CPU(int, x2apic_extra_bits);
extern void uv_cpu_init(void);
extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
+#else
+#define is_uv_system() 0
+#define uv_cpu_init() ((void)0)
+#endif
#endif
extern spinlock_t i8253_lock;
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+
extern struct clock_event_device *global_clock_event;
extern void setup_pit_timer(void);
+#endif
+
#define inb_pit inb_p
#define outb_pit outb_p
#ifndef __ASM_IPI_H
#define __ASM_IPI_H
+#ifndef CONFIG_XEN
+
/*
* Copyright 2004 James Cleverdon, IBM.
* Subject to the GNU Public License, v.2
local_irq_restore(flags);
}
+#endif /* CONFIG_XEN */
+
#endif /* __ASM_IPI_H */
return ((irq == 2) ? 9 : irq);
}
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
# define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */
#endif
unsigned long start_address) ATTRIB_NORET;
#endif
+/* Under Xen we need to work with machine addresses. These macros give the
+ * machine address of a certain page to the generic kexec code instead of
+ * the pseudo physical address which would be given by the default macros.
+ */
+
+#ifdef CONFIG_XEN
+#define KEXEC_ARCH_HAS_PAGE_MACROS
+#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* _KEXEC_H */
--- /dev/null
+#ifndef _ASM_X86_AGP_H
+#define _ASM_X86_AGP_H
+
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+#include <asm/system.h>
+
+/*
+ * Functions to keep the agpgart mappings coherent with the MMU. The
+ * GART gives the CPU a physical alias of pages in memory. The alias
+ * region is mapped uncacheable. Make sure there are no conflicting
+ * mappings with different cachability attributes for the same
+ * page. This avoids data corruption on some CPUs.
+ */
+
+#define map_page_into_agp(page) ( \
+ xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
+ ?: set_pages_uc(page, 1))
+#define unmap_page_from_agp(page) ( \
+ xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
+ /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
+ set_pages_wb(page, 1))
+
+/*
+ * Could use CLFLUSH here if the cpu supports it. But then it would
+ * need to be called for each cacheline of the whole page so it may
+ * not be worth it. Would need a page for it.
+ */
+#define flush_agp_cache() wbinvd()
+
+/* Convert a physical address to an address suitable for the GART. */
+#define phys_to_gart(x) phys_to_machine(x)
+#define gart_to_phys(x) machine_to_phys(x)
+
+/* GATT allocation. Returns/accepts GATT kernel virtual address. */
+#define alloc_gatt_pages(order) ({ \
+ char *_t; dma_addr_t _d; \
+ _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \
+ _t; })
+#define free_gatt_pages(table, order) \
+ dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
+
+#endif
--- /dev/null
+#ifndef _ASM_DESC_H_
+#define _ASM_DESC_H_
+
+#ifndef __ASSEMBLY__
+#include <asm/desc_defs.h>
+#include <asm/ldt.h>
+#include <asm/mmu.h>
+#include <linux/smp.h>
+
+static inline void fill_ldt(struct desc_struct *desc,
+ const struct user_desc *info)
+{
+ desc->limit0 = info->limit & 0x0ffff;
+ desc->base0 = info->base_addr & 0x0000ffff;
+
+ desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
+ desc->type = (info->read_exec_only ^ 1) << 1;
+ desc->type |= info->contents << 2;
+ desc->s = 1;
+ desc->dpl = 0x3;
+ desc->p = info->seg_not_present ^ 1;
+ desc->limit = (info->limit & 0xf0000) >> 16;
+ desc->avl = info->useable;
+ desc->d = info->seg_32bit;
+ desc->g = info->limit_in_pages;
+ desc->base2 = (info->base_addr & 0xff000000) >> 24;
+}
+
+#ifndef CONFIG_X86_NO_IDT
+extern struct desc_ptr idt_descr;
+extern gate_desc idt_table[];
+#endif
+
+#ifdef CONFIG_X86_64
+extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
+extern struct desc_ptr cpu_gdt_descr[];
+/* the cpu gdt accessor */
+#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
+
+static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
+ unsigned dpl, unsigned ist, unsigned seg)
+{
+ gate->offset_low = PTR_LOW(func);
+ gate->segment = __KERNEL_CS;
+ gate->ist = ist;
+ gate->p = 1;
+ gate->dpl = dpl;
+ gate->zero0 = 0;
+ gate->zero1 = 0;
+ gate->type = type;
+ gate->offset_middle = PTR_MIDDLE(func);
+ gate->offset_high = PTR_HIGH(func);
+}
+
+#else
+struct gdt_page {
+ struct desc_struct gdt[GDT_ENTRIES];
+} __attribute__((aligned(PAGE_SIZE)));
+DECLARE_PER_CPU(struct gdt_page, gdt_page);
+
+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+{
+ return per_cpu(gdt_page, cpu).gdt;
+}
+
+static inline void pack_gate(gate_desc *gate, unsigned char type,
+ unsigned long base, unsigned dpl, unsigned flags,
+ unsigned short seg)
+{
+ gate->a = (seg << 16) | (base & 0xffff);
+ gate->b = (base & 0xffff0000) |
+ (((0x80 | type | (dpl << 5)) & 0xff) << 8);
+}
+
+#endif
+
+static inline int desc_empty(const void *ptr)
+{
+ const u32 *desc = ptr;
+ return !(desc[0] | desc[1]);
+}
+
+#ifndef CONFIG_XEN
+#define load_TR_desc() native_load_tr_desc()
+#define load_gdt(dtr) native_load_gdt(dtr)
+#define load_idt(dtr) native_load_idt(dtr)
+#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr) native_store_gdt(dtr)
+#define store_idt(dtr) native_store_idt(dtr)
+#define store_tr(tr) (tr = native_store_tr())
+#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
+
+#define load_TLS(t, cpu) native_load_tls(t, cpu)
+#define set_ldt native_set_ldt
+
+#define write_ldt_entry(dt, entry, desc) \
+ native_write_ldt_entry(dt, entry, desc)
+#define write_gdt_entry(dt, entry, desc, type) \
+ native_write_gdt_entry(dt, entry, desc, type)
+#define write_idt_entry(dt, entry, g) \
+ native_write_idt_entry(dt, entry, g)
+
+static inline void native_write_idt_entry(gate_desc *idt, int entry,
+ const gate_desc *gate)
+{
+ memcpy(&idt[entry], gate, sizeof(*gate));
+}
+
+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
+ const void *desc)
+{
+ memcpy(&ldt[entry], desc, 8);
+}
+
+static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
+ const void *desc, int type)
+{
+ unsigned int size;
+ switch (type) {
+ case DESC_TSS:
+ size = sizeof(tss_desc);
+ break;
+ case DESC_LDT:
+ size = sizeof(ldt_desc);
+ break;
+ default:
+ size = sizeof(struct desc_struct);
+ break;
+ }
+ memcpy(&gdt[entry], desc, size);
+}
+#endif
+
+static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
+ unsigned long limit, unsigned char type,
+ unsigned char flags)
+{
+ desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
+ desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
+ (limit & 0x000f0000) | ((type & 0xff) << 8) |
+ ((flags & 0xf) << 20);
+ desc->p = 1;
+}
+
+
+#ifndef CONFIG_XEN
+static inline void set_tssldt_descriptor(void *d, unsigned long addr,
+ unsigned type, unsigned size)
+{
+#ifdef CONFIG_X86_64
+ struct ldttss_desc64 *desc = d;
+ memset(desc, 0, sizeof(*desc));
+ desc->limit0 = size & 0xFFFF;
+ desc->base0 = PTR_LOW(addr);
+ desc->base1 = PTR_MIDDLE(addr) & 0xFF;
+ desc->type = type;
+ desc->p = 1;
+ desc->limit1 = (size >> 16) & 0xF;
+ desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
+ desc->base3 = PTR_HIGH(addr);
+#else
+ pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
+#endif
+}
+
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+{
+ struct desc_struct *d = get_cpu_gdt_table(cpu);
+ tss_desc tss;
+
+ /*
+ * sizeof(unsigned long) coming from an extra "long" at the end
+ * of the iobitmap. See tss_struct definition in processor.h
+ *
+ * -1? seg base+limit should be pointing to the address of the
+ * last valid byte
+ */
+ set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
+ IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
+ sizeof(unsigned long) - 1);
+ write_gdt_entry(d, entry, &tss, DESC_TSS);
+}
+
+#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+
+static inline void native_set_ldt(const void *addr, unsigned int entries)
+{
+ if (likely(entries == 0))
+ asm volatile("lldt %w0"::"q" (0));
+ else {
+ unsigned cpu = smp_processor_id();
+ ldt_desc ldt;
+
+ set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
+ entries * LDT_ENTRY_SIZE - 1);
+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
+ &ldt, DESC_LDT);
+ asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+ }
+}
+
+static inline void native_load_tr_desc(void)
+{
+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static inline void native_load_gdt(const struct desc_ptr *dtr)
+{
+ asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static inline void native_load_idt(const struct desc_ptr *dtr)
+{
+ asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static inline void native_store_gdt(struct desc_ptr *dtr)
+{
+ asm volatile("sgdt %0":"=m" (*dtr));
+}
+
+static inline void native_store_idt(struct desc_ptr *dtr)
+{
+ asm volatile("sidt %0":"=m" (*dtr));
+}
+
+static inline unsigned long native_store_tr(void)
+{
+ unsigned long tr;
+ asm volatile("str %0":"=r" (tr));
+ return tr;
+}
+
+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ unsigned int i;
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+ gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
+}
+#else
+#define load_TLS(t, cpu) xen_load_tls(t, cpu)
+#define set_ldt xen_set_ldt
+
+extern int write_ldt_entry(struct desc_struct *ldt, int entry,
+ const void *desc);
+extern int write_gdt_entry(struct desc_struct *gdt, int entry,
+ const void *desc, int type);
+
+static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ unsigned int i;
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
+
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+ if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
+ *(u64 *)&t->tls_array[i]))
+ BUG();
+}
+#endif
+
+#define _LDT_empty(info) \
+ ((info)->base_addr == 0 && \
+ (info)->limit == 0 && \
+ (info)->contents == 0 && \
+ (info)->read_exec_only == 1 && \
+ (info)->seg_32bit == 0 && \
+ (info)->limit_in_pages == 0 && \
+ (info)->seg_not_present == 1 && \
+ (info)->useable == 0)
+
+#ifdef CONFIG_X86_64
+#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
+#else
+#define LDT_empty(info) (_LDT_empty(info))
+#endif
+
+static inline void clear_LDT(void)
+{
+ set_ldt(NULL, 0);
+}
+
+/*
+ * load one particular LDT into the current CPU
+ */
+static inline void load_LDT_nolock(mm_context_t *pc)
+{
+ set_ldt(pc->ldt, pc->size);
+}
+
+static inline void load_LDT(mm_context_t *pc)
+{
+ preempt_disable();
+ load_LDT_nolock(pc);
+ preempt_enable();
+}
+
+static inline unsigned long get_desc_base(const struct desc_struct *desc)
+{
+ return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
+}
+
+static inline unsigned long get_desc_limit(const struct desc_struct *desc)
+{
+ return desc->limit0 | (desc->limit << 16);
+}
+
+#ifndef CONFIG_X86_NO_IDT
+static inline void _set_gate(int gate, unsigned type, void *addr,
+ unsigned dpl, unsigned ist, unsigned seg)
+{
+ gate_desc s;
+ pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
+ /*
+ * does not need to be atomic because it is only done once at
+ * setup time
+ */
+ write_idt_entry(idt_table, gate, &s);
+}
+
+/*
+ * This needs to use 'idt_table' rather than 'idt', and
+ * thus use the _nonmapped_ version of the IDT, as the
+ * Pentium F0 0F bugfix can have resulted in the mapped
+ * IDT being write-protected.
+ */
+static inline void set_intr_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
+}
+
+/*
+ * This routine sets up an interrupt gate at directory privilege level 3.
+ */
+static inline void set_system_intr_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
+}
+
+static inline void set_trap_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
+}
+
+static inline void set_system_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+#ifdef CONFIG_X86_32
+ _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
+#else
+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
+#endif
+}
+
+static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
+}
+
+static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
+}
+
+static inline void set_system_gate_ist(int n, void *addr, unsigned ist)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
+}
+#endif
+
+#else
+/*
+ * GET_DESC_BASE reads the descriptor base of the specified segment.
+ *
+ * Args:
+ * idx - descriptor index
+ * gdt - GDT pointer
+ * base - 32bit register to which the base will be written
+ * lo_w - lo word of the "base" register
+ * lo_b - lo byte of the "base" register
+ * hi_b - hi byte of the low word of the "base" register
+ *
+ * Example:
+ * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
+ * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
+ */
+#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
+ movb idx * 8 + 4(gdt), lo_b; \
+ movb idx * 8 + 7(gdt), hi_b; \
+ shll $16, base; \
+ movw idx * 8 + 2(gdt), lo_w;
+
+
+#endif /* __ASSEMBLY__ */
+
+#endif
--- /dev/null
+#ifndef _ASM_DMA_MAPPING_H_
+
+#include "../../dma-mapping.h"
+
+static inline int
+address_needs_mapping(struct device *hwdev, dma_addr_t addr)
+{
+ dma_addr_t mask = 0xffffffff;
+ /* If the device has a mask, use it, otherwise default to 32 bits */
+ if (hwdev && hwdev->dma_mask)
+ mask = *hwdev->dma_mask;
+ return (addr & ~mask) != 0;
+}
+
+extern int range_straddles_page_boundary(paddr_t p, size_t size);
+
+#endif /* _ASM_DMA_MAPPING_H_ */
--- /dev/null
+#ifndef __ASM_E820_H
+#define __ASM_E820_H
+#define E820MAP 0x2d0 /* our map */
+#define E820MAX 128 /* number of entries in E820MAP */
+#define E820NR 0x1e8 /* # entries in E820MAP */
+
+#define E820_RAM 1
+#define E820_RESERVED 2
+#define E820_ACPI 3
+#define E820_NVS 4
+
+#ifndef __ASSEMBLY__
+struct e820entry {
+ __u64 addr; /* start of memory segment */
+ __u64 size; /* size of memory segment */
+ __u32 type; /* type of memory segment */
+} __attribute__((packed));
+
+struct e820map {
+ __u32 nr_map;
+ struct e820entry map[E820MAX];
+};
+#endif /* __ASSEMBLY__ */
+
+#define ISA_START_ADDRESS 0xa0000
+#define ISA_END_ADDRESS 0x100000
+
+#define BIOS_BEGIN 0x000a0000
+#define BIOS_END 0x00100000
+
+#ifdef __KERNEL__
+#ifdef CONFIG_X86_32
+# include "../../e820_32.h"
+#else
+# include "e820_64.h"
+#endif
+#endif /* __KERNEL__ */
+
+#endif /* __ASM_E820_H */
--- /dev/null
+/*
+ * structures and definitions for the int 15, ax=e820 memory map
+ * scheme.
+ *
+ * In a nutshell, setup.S populates a scratch table in the
+ * empty_zero_block that contains a list of usable address/size
+ * duples. setup.c, this information is transferred into the e820map,
+ * and in init.c/numa.c, that new information is used to mark pages
+ * reserved or not.
+ */
+#ifndef __E820_HEADER
+#define __E820_HEADER
+
+#include <linux/ioport.h>
+
+#ifndef __ASSEMBLY__
+extern unsigned long find_e820_area(unsigned long start, unsigned long end,
+ unsigned long size, unsigned long align);
+extern unsigned long find_e820_area_size(unsigned long start,
+ unsigned long *sizep,
+ unsigned long align);
+extern void add_memory_region(unsigned long start, unsigned long size,
+ int type);
+extern void update_memory_range(u64 start, u64 size, unsigned old_type,
+ unsigned new_type);
+extern void setup_memory_region(void);
+extern void contig_e820_setup(void);
+extern unsigned long e820_end_of_ram(void);
+extern void e820_reserve_resources(struct e820entry *e820, int nr_map);
+extern void e820_mark_nosave_regions(void);
+extern int e820_any_mapped(unsigned long start, unsigned long end,
+ unsigned type);
+extern int e820_all_mapped(unsigned long start, unsigned long end,
+ unsigned type);
+extern int e820_any_non_reserved(unsigned long start, unsigned long end);
+extern int is_memory_any_valid(unsigned long start, unsigned long end);
+extern int e820_all_non_reserved(unsigned long start, unsigned long end);
+extern int is_memory_all_valid(unsigned long start, unsigned long end);
+extern unsigned long e820_hole_size(unsigned long start, unsigned long end);
+
+extern void e820_setup_gap(struct e820entry *e820, int nr_map);
+extern void e820_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn);
+
+extern void finish_e820_parsing(void);
+
+extern struct e820map e820;
+extern void update_e820(void);
+
+extern void reserve_early(unsigned long start, unsigned long end, char *name);
+extern void free_early(unsigned long start, unsigned long end);
+extern void early_res_to_bootmem(unsigned long start, unsigned long end);
+
+#endif/*!__ASSEMBLY__*/
+
+#endif/*__E820_HEADER*/
--- /dev/null
+#ifndef _ASM_FIXMAP_H
+#define _ASM_FIXMAP_H
+
+#ifdef CONFIG_X86_32
+# include "fixmap_32.h"
+#else
+# include "fixmap_64.h"
+#endif
+
+#define clear_fixmap(idx) \
+ __set_fixmap(idx, 0, __pgprot(0))
+
+#endif
--- /dev/null
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ */
+
+#ifndef _ASM_FIXMAP_32_H
+#define _ASM_FIXMAP_32_H
+
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+extern unsigned long __FIXADDR_TOP;
+#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
+#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
+
+#ifndef __ASSEMBLY__
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#ifdef CONFIG_HIGHMEM
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#endif
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process. We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * these 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages. (or larger if used with an increment
+ * highger than 1) use fixmap_set(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+enum fixed_addresses {
+ FIX_HOLE,
+ FIX_VDSO,
+ FIX_DBGP_BASE,
+ FIX_EARLYCON_MEM_BASE,
+#ifndef CONFIG_XEN
+#ifdef CONFIG_X86_LOCAL_APIC
+ FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
+#endif
+#ifdef CONFIG_X86_IO_APIC
+ FIX_IO_APIC_BASE_0,
+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
+#endif
+#endif
+#ifdef CONFIG_X86_VISWS_APIC
+ FIX_CO_CPU, /* Cobalt timer */
+ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
+ FIX_LI_PCIA, /* Lithium PCI Bridge A */
+ FIX_LI_PCIB, /* Lithium PCI Bridge B */
+#endif
+#ifdef CONFIG_X86_F00F_BUG
+ FIX_F00F_IDT, /* Virtual mapping for IDT */
+#endif
+#ifdef CONFIG_X86_CYCLONE_TIMER
+ FIX_CYCLONE_TIMER, /*cyclone timer register*/
+#endif
+#ifdef CONFIG_HIGHMEM
+ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#endif
+#ifdef CONFIG_ACPI
+ FIX_ACPI_BEGIN,
+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
+#endif
+#ifdef CONFIG_PCI_MMCONFIG
+ FIX_PCIE_MCFG,
+#endif
+#ifdef CONFIG_PARAVIRT
+ FIX_PARAVIRT_BOOTMAP,
+#endif
+ FIX_SHARED_INFO,
+#define NR_FIX_ISAMAPS 256
+ FIX_ISAMAP_END,
+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
+ __end_of_permanent_fixed_addresses,
+ /*
+ * 256 temporary boot-time mappings, used by early_ioremap(),
+ * before ioremap() is functional.
+ *
+ * We round it up to the next 512 pages boundary so that we
+ * can have a single pgd entry and a single pte table:
+ */
+#define NR_FIX_BTMAPS 64
+#define FIX_BTMAPS_NESTING 4
+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
+ (__end_of_permanent_fixed_addresses & 511),
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
+ FIX_WP_TEST,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
+ __end_of_fixed_addresses
+};
+
+extern void __set_fixmap(enum fixed_addresses idx,
+ maddr_t phys, pgprot_t flags);
+extern void reserve_top_address(unsigned long reserve);
+
+#define set_fixmap(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL)
+/*
+ * Some hardware wants to get fixmapped without caching.
+ */
+#define set_fixmap_nocache(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
+
+#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
+
+#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
+#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
+
+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+extern void __this_fixmap_does_not_exist(void);
+
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without tranlation, we catch the bug with a NULL-deference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+{
+ /*
+ * this branch gets completely eliminated after inlining,
+ * except when someone tries to use fixaddr indices in an
+ * illegal way. (such as mixing up address types or using
+ * out-of-range indices).
+ *
+ * If it doesn't get removed, the linker will complain
+ * loudly with a reasonably clear error message..
+ */
+ if (idx >= __end_of_fixed_addresses)
+ __this_fixmap_does_not_exist();
+
+ return __fix_to_virt(idx);
+}
+
+static inline unsigned long virt_to_fix(const unsigned long vaddr)
+{
+ BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+ return __virt_to_fix(vaddr);
+}
+
+#endif /* !__ASSEMBLY__ */
+#endif
--- /dev/null
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ */
+
+#ifndef _ASM_FIXMAP_64_H
+#define _ASM_FIXMAP_64_H
+
+#include <linux/kernel.h>
+#include <asm/page.h>
+#include <asm/vsyscall.h>
+#include <asm/efi.h>
+#include <asm/acpi.h>
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+
+enum fixed_addresses {
+ VSYSCALL_LAST_PAGE,
+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+ VSYSCALL_HPET,
+ FIX_DBGP_BASE,
+ FIX_EARLYCON_MEM_BASE,
+ FIX_HPET_BASE,
+#ifndef CONFIG_XEN
+ FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
+ FIX_IO_APIC_BASE_0,
+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+#endif
+#ifdef CONFIG_EFI
+ FIX_EFI_IO_MAP_LAST_PAGE,
+ FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
+ + MAX_EFI_IO_PAGES - 1,
+#endif
+#ifdef CONFIG_ACPI
+ FIX_ACPI_BEGIN,
+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
+#endif
+ FIX_SHARED_INFO,
+#define NR_FIX_ISAMAPS 256
+ FIX_ISAMAP_END,
+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
+ __end_of_permanent_fixed_addresses,
+ /*
+ * 256 temporary boot-time mappings, used by early_ioremap(),
+ * before ioremap() is functional.
+ *
+ * We round it up to the next 512 pages boundary so that we
+ * can have a single pgd entry and a single pte table:
+ */
+#define NR_FIX_BTMAPS 64
+#define FIX_BTMAPS_NESTING 4
+ FIX_BTMAP_END =
+ __end_of_permanent_fixed_addresses + 512 -
+ (__end_of_permanent_fixed_addresses & 511),
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
+ __end_of_fixed_addresses
+};
+
+extern void __set_fixmap(enum fixed_addresses idx,
+ unsigned long phys, pgprot_t flags);
+
+#define set_fixmap(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL)
+/*
+ * Some hardware wants to get fixmapped without caching.
+ */
+#define set_fixmap_nocache(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
+
+#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
+#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
+
+/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
+#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
+#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
+
+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+
+extern void __this_fixmap_does_not_exist(void);
+
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-deference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+{
+ /*
+ * this branch gets completely eliminated after inlining,
+ * except when someone tries to use fixaddr indices in an
+ * illegal way. (such as mixing up address types or using
+ * out-of-range indices).
+ *
+ * If it doesn't get removed, the linker will complain
+ * loudly with a reasonably clear error message..
+ */
+ if (idx >= __end_of_fixed_addresses)
+ __this_fixmap_does_not_exist();
+
+ return __fix_to_virt(idx);
+}
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2007 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _ASM_I386_GNTTAB_DMA_H
+#define _ASM_I386_GNTTAB_DMA_H
+
+static inline int gnttab_dma_local_pfn(struct page *page)
+{
+ /* Has it become a local MFN? */
+ return pfn_valid(mfn_to_local_pfn(pfn_to_mfn(page_to_pfn(page))));
+}
+
+static inline maddr_t gnttab_dma_map_page(struct page *page)
+{
+ __gnttab_dma_map_page(page);
+ return ((maddr_t)pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT);
+}
+
+static inline void gnttab_dma_unmap_page(maddr_t maddr)
+{
+ __gnttab_dma_unmap_page(virt_to_page(bus_to_virt(maddr)));
+}
+
+#endif /* _ASM_I386_GNTTAB_DMA_H */
--- /dev/null
+/*
+ * highmem.h: virtual kernel memory mappings for high memory
+ *
+ * Used in CONFIG_HIGHMEM systems for memory pages which
+ * are not addressable by direct kernel virtual addresses.
+ *
+ * Copyright (C) 1999 Gerhard Wichert, Siemens AG
+ * Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * up to 16 Terabyte physical memory. With current x86 CPUs
+ * we now support up to 64 Gigabytes physical RAM.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#ifndef _ASM_HIGHMEM_H
+#define _ASM_HIGHMEM_H
+
+#ifdef __KERNEL__
+
+#include <linux/interrupt.h>
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#include <asm/tlbflush.h>
+
+/* declarations for highmem.c */
+extern unsigned long highstart_pfn, highend_pfn;
+
+extern pte_t *kmap_pte;
+extern pgprot_t kmap_prot;
+extern pte_t *pkmap_page_table;
+
+/*
+ * Right now we initialize only a single pte table. It can be extended
+ * easily, subsequent pte tables have to be allocated in one physical
+ * chunk of RAM.
+ */
+/*
+ * Ordering is:
+ *
+ * FIXADDR_TOP
+ * fixed_addresses
+ * FIXADDR_START
+ * temp fixed addresses
+ * FIXADDR_BOOT_START
+ * Persistent kmap area
+ * PKMAP_BASE
+ * VMALLOC_END
+ * Vmalloc area
+ * VMALLOC_START
+ * high_memory
+ */
+#define LAST_PKMAP_MASK (LAST_PKMAP-1)
+#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
+#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
+
+extern void *kmap_high(struct page *page);
+extern void kunmap_high(struct page *page);
+
+void *kmap(struct page *page);
+void kunmap(struct page *page);
+void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
+void *kmap_atomic(struct page *page, enum km_type type);
+void *kmap_atomic_pte(struct page *page, enum km_type type);
+void kunmap_atomic(void *kvaddr, enum km_type type);
+void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
+struct page *kmap_atomic_to_page(void *ptr);
+
+#define kmap_atomic_pte(page, type) \
+ kmap_atomic_prot(page, type, \
+ PagePinned(page) ? PAGE_KERNEL_RO : kmap_prot)
+
+#define flush_cache_kmaps() do { } while (0)
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_HIGHMEM_H */
--- /dev/null
+#ifdef CONFIG_X86_32
+# include "hw_irq_32.h"
+#else
+# include "hw_irq_64.h"
+#endif
--- /dev/null
+#ifndef _ASM_HW_IRQ_H
+#define _ASM_HW_IRQ_H
+
+/*
+ * linux/include/asm/hw_irq.h
+ *
+ * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
+ *
+ * moved some of the old arch/i386/kernel/irq.h to here. VY
+ *
+ * IRQ/IPI changes taken from work by Thomas Radke
+ * <tomsoft@informatik.tu-chemnitz.de>
+ */
+
+#include <linux/profile.h>
+#include <asm/atomic.h>
+#include <asm/irq.h>
+#include <asm/sections.h>
+
+/*
+ * Various low-level irq details needed by irq.c, process.c,
+ * time.c, io_apic.c and smp.c
+ *
+ * Interrupt entry/exit code at both C and assembly level
+ */
+
+extern void (*const interrupt[NR_IRQS])(void);
+
+#ifdef CONFIG_SMP
+void reschedule_interrupt(void);
+void invalidate_interrupt(void);
+void call_function_interrupt(void);
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+void apic_timer_interrupt(void);
+void error_interrupt(void);
+void spurious_interrupt(void);
+void thermal_interrupt(void);
+#define platform_legacy_irq(irq) ((irq) < 16)
+#endif
+
+void disable_8259A_irq(unsigned int irq);
+void enable_8259A_irq(unsigned int irq);
+int i8259A_irq_pending(unsigned int irq);
+void make_8259A_irq(unsigned int irq);
+void init_8259A(int aeoi);
+void send_IPI_self(int vector);
+void init_VISWS_APIC_irqs(void);
+void setup_IO_APIC(void);
+void disable_IO_APIC(void);
+#define print_IO_APIC()
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
+void send_IPI(int dest, int vector);
+void setup_ioapic_dest(void);
+
+extern unsigned long io_apic_irqs;
+
+#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
+
+#endif /* _ASM_HW_IRQ_H */
--- /dev/null
+#ifndef _ASM_HW_IRQ_H
+#define _ASM_HW_IRQ_H
+
+/*
+ * linux/include/asm/hw_irq.h
+ *
+ * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
+ *
+ * moved some of the old arch/i386/kernel/irq.h to here. VY
+ *
+ * IRQ/IPI changes taken from work by Thomas Radke
+ * <tomsoft@informatik.tu-chemnitz.de>
+ *
+ * hacked by Andi Kleen for x86-64.
+ */
+
+#ifndef __ASSEMBLY__
+#include <asm/atomic.h>
+#include <asm/irq.h>
+#include <linux/profile.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#endif
+
+/*
+ * IDT vectors usable for external interrupt sources start
+ * at 0x20:
+ */
+#define FIRST_EXTERNAL_VECTOR 0x20
+
+#define IA32_SYSCALL_VECTOR 0x80
+
+#ifndef CONFIG_XEN
+
+/* Reserve the lowest usable priority level 0x20 - 0x2f for triggering
+ * cleanup after irq migration.
+ */
+#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
+
+/*
+ * Vectors 0x30-0x3f are used for ISA interrupts.
+ */
+#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10)
+#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
+#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
+#define IRQ3_VECTOR (IRQ0_VECTOR + 3)
+#define IRQ4_VECTOR (IRQ0_VECTOR + 4)
+#define IRQ5_VECTOR (IRQ0_VECTOR + 5)
+#define IRQ6_VECTOR (IRQ0_VECTOR + 6)
+#define IRQ7_VECTOR (IRQ0_VECTOR + 7)
+#define IRQ8_VECTOR (IRQ0_VECTOR + 8)
+#define IRQ9_VECTOR (IRQ0_VECTOR + 9)
+#define IRQ10_VECTOR (IRQ0_VECTOR + 10)
+#define IRQ11_VECTOR (IRQ0_VECTOR + 11)
+#define IRQ12_VECTOR (IRQ0_VECTOR + 12)
+#define IRQ13_VECTOR (IRQ0_VECTOR + 13)
+#define IRQ14_VECTOR (IRQ0_VECTOR + 14)
+#define IRQ15_VECTOR (IRQ0_VECTOR + 15)
+
+/*
+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
+ *
+ * some of the following vectors are 'rare', they are merged
+ * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
+ * TLB, reschedule and local APIC vectors are performance-critical.
+ */
+#define SPURIOUS_APIC_VECTOR 0xff
+#define ERROR_APIC_VECTOR 0xfe
+#define RESCHEDULE_VECTOR 0xfd
+#define CALL_FUNCTION_VECTOR 0xfc
+/* fb free - please don't readd KDB here because it's useless
+ (hint - think what a NMI bit does to a vector) */
+#define THERMAL_APIC_VECTOR 0xfa
+#define THRESHOLD_APIC_VECTOR 0xf9
+/* f8 free */
+#define INVALIDATE_TLB_VECTOR_END 0xf7
+#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */
+
+#define NUM_INVALIDATE_TLB_VECTORS 8
+
+/*
+ * Local APIC timer IRQ vector is on a different priority level,
+ * to work around the 'lost local interrupt if more than 2 IRQ
+ * sources per level' errata.
+ */
+#define LOCAL_TIMER_VECTOR 0xef
+
+/*
+ * First APIC vector available to drivers: (vectors 0x30-0xee)
+ * we start at 0x41 to spread out vectors evenly between priority
+ * levels. (0x80 is the syscall vector)
+ */
+#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
+#define FIRST_SYSTEM_VECTOR 0xef /* duplicated in irq.h */
+
+#endif
+
+#ifndef __ASSEMBLY__
+
+/* Interrupt handlers registered during init_IRQ */
+void apic_timer_interrupt(void);
+void spurious_interrupt(void);
+void error_interrupt(void);
+void reschedule_interrupt(void);
+void call_function_interrupt(void);
+void irq_move_cleanup_interrupt(void);
+void invalidate_interrupt0(void);
+void invalidate_interrupt1(void);
+void invalidate_interrupt2(void);
+void invalidate_interrupt3(void);
+void invalidate_interrupt4(void);
+void invalidate_interrupt5(void);
+void invalidate_interrupt6(void);
+void invalidate_interrupt7(void);
+void thermal_interrupt(void);
+void threshold_interrupt(void);
+void i8254_timer_resume(void);
+
+typedef int vector_irq_t[NR_VECTORS];
+DECLARE_PER_CPU(vector_irq_t, vector_irq);
+extern void __setup_vector_irq(int cpu);
+extern spinlock_t vector_lock;
+
+/*
+ * Various low-level irq details needed by irq.c, process.c,
+ * time.c, io_apic.c and smp.c
+ *
+ * Interrupt entry/exit code at both C and assembly level
+ */
+
+extern void disable_8259A_irq(unsigned int irq);
+extern void enable_8259A_irq(unsigned int irq);
+extern int i8259A_irq_pending(unsigned int irq);
+extern void make_8259A_irq(unsigned int irq);
+extern void init_8259A(int aeoi);
+extern void send_IPI_self(int vector);
+extern void init_VISWS_APIC_irqs(void);
+extern void setup_IO_APIC(void);
+extern void enable_IO_APIC(void);
+extern void disable_IO_APIC(void);
+#define print_IO_APIC()
+extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
+extern void send_IPI(int dest, int vector);
+extern void setup_ioapic_dest(void);
+extern void native_init_IRQ(void);
+
+extern unsigned long io_apic_irqs;
+
+#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
+
+#include <asm/ptrace.h>
+
+#define IRQ_NAME2(nr) nr##_interrupt(void)
+#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
+
+/*
+ * SMP has a few special interrupts for IPI messages
+ */
+
+#define BUILD_IRQ(nr) \
+ asmlinkage void IRQ_NAME(nr); \
+ asm("\n.p2align\n" \
+ "IRQ" #nr "_interrupt:\n\t" \
+ "push $~(" #nr ") ; " \
+ "jmp common_interrupt");
+
+#define platform_legacy_irq(irq) ((irq) < 16)
+
+#endif
+
+#endif /* _ASM_HW_IRQ_H */
--- /dev/null
+#ifdef CONFIG_X86_32
+# include "hypercall_32.h"
+#else
+# include "hypercall_64.h"
+#endif
--- /dev/null
+/******************************************************************************
+ * hypercall.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __HYPERCALL_H__
+#define __HYPERCALL_H__
+
+#include <linux/string.h> /* memcpy() */
+#include <linux/stringify.h>
+
+#ifndef __HYPERVISOR_H__
+# error "please don't include this file directly"
+#endif
+
+#ifdef CONFIG_XEN
+#define HYPERCALL_STR(name) \
+ "call hypercall_page + ("__stringify(__HYPERVISOR_##name)" * 32)"
+#else
+#define HYPERCALL_STR(name) \
+ "mov hypercall_stubs,%%eax; " \
+ "add $("__stringify(__HYPERVISOR_##name)" * 32),%%eax; "\
+ "call *%%eax"
+#endif
+
+#define _hypercall0(type, name) \
+({ \
+ type __res; \
+ asm volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res) \
+ : \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall1(type, name, a1) \
+({ \
+ type __res; \
+ long __ign1; \
+ asm volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=b" (__ign1) \
+ : "1" ((long)(a1)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall2(type, name, a1, a2) \
+({ \
+ type __res; \
+ long __ign1, __ign2; \
+ asm volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \
+ : "1" ((long)(a1)), "2" ((long)(a2)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall3(type, name, a1, a2, a3) \
+({ \
+ type __res; \
+ long __ign1, __ign2, __ign3; \
+ asm volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
+ "=d" (__ign3) \
+ : "1" ((long)(a1)), "2" ((long)(a2)), \
+ "3" ((long)(a3)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall4(type, name, a1, a2, a3, a4) \
+({ \
+ type __res; \
+ long __ign1, __ign2, __ign3, __ign4; \
+ asm volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
+ "=d" (__ign3), "=S" (__ign4) \
+ : "1" ((long)(a1)), "2" ((long)(a2)), \
+ "3" ((long)(a3)), "4" ((long)(a4)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
+({ \
+ type __res; \
+ long __ign1, __ign2, __ign3, __ign4, __ign5; \
+ asm volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
+ "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \
+ : "1" ((long)(a1)), "2" ((long)(a2)), \
+ "3" ((long)(a3)), "4" ((long)(a4)), \
+ "5" ((long)(a5)) \
+ : "memory" ); \
+ __res; \
+})
+
+static inline int __must_check
+HYPERVISOR_set_trap_table(
+ const trap_info_t *table)
+{
+ return _hypercall1(int, set_trap_table, table);
+}
+
+static inline int __must_check
+HYPERVISOR_mmu_update(
+ mmu_update_t *req, unsigned int count, unsigned int *success_count,
+ domid_t domid)
+{
+ return _hypercall4(int, mmu_update, req, count, success_count, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_mmuext_op(
+ struct mmuext_op *op, unsigned int count, unsigned int *success_count,
+ domid_t domid)
+{
+ return _hypercall4(int, mmuext_op, op, count, success_count, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_set_gdt(
+ unsigned long *frame_list, unsigned int entries)
+{
+ return _hypercall2(int, set_gdt, frame_list, entries);
+}
+
+static inline int __must_check
+HYPERVISOR_stack_switch(
+ unsigned long ss, unsigned long esp)
+{
+ return _hypercall2(int, stack_switch, ss, esp);
+}
+
+static inline int __must_check
+HYPERVISOR_set_callbacks(
+ unsigned long event_selector, unsigned long event_address,
+ unsigned long failsafe_selector, unsigned long failsafe_address)
+{
+ return _hypercall4(int, set_callbacks,
+ event_selector, event_address,
+ failsafe_selector, failsafe_address);
+}
+
+static inline int
+HYPERVISOR_fpu_taskswitch(
+ int set)
+{
+ return _hypercall1(int, fpu_taskswitch, set);
+}
+
+static inline int __must_check
+HYPERVISOR_sched_op_compat(
+ int cmd, unsigned long arg)
+{
+ return _hypercall2(int, sched_op_compat, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_sched_op(
+ int cmd, void *arg)
+{
+ return _hypercall2(int, sched_op, cmd, arg);
+}
+
+static inline long __must_check
+HYPERVISOR_set_timer_op(
+ u64 timeout)
+{
+ unsigned long timeout_hi = (unsigned long)(timeout>>32);
+ unsigned long timeout_lo = (unsigned long)timeout;
+ return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
+}
+
+static inline int __must_check
+HYPERVISOR_platform_op(
+ struct xen_platform_op *platform_op)
+{
+ platform_op->interface_version = XENPF_INTERFACE_VERSION;
+ return _hypercall1(int, platform_op, platform_op);
+}
+
+static inline int __must_check
+HYPERVISOR_set_debugreg(
+ unsigned int reg, unsigned long value)
+{
+ return _hypercall2(int, set_debugreg, reg, value);
+}
+
+static inline unsigned long __must_check
+HYPERVISOR_get_debugreg(
+ unsigned int reg)
+{
+ return _hypercall1(unsigned long, get_debugreg, reg);
+}
+
+static inline int __must_check
+HYPERVISOR_update_descriptor(
+ u64 ma, u64 desc)
+{
+ return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
+}
+
+static inline int __must_check
+HYPERVISOR_memory_op(
+ unsigned int cmd, void *arg)
+{
+ return _hypercall2(int, memory_op, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_multicall(
+ multicall_entry_t *call_list, unsigned int nr_calls)
+{
+ return _hypercall2(int, multicall, call_list, nr_calls);
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping(
+ unsigned long va, pte_t new_val, unsigned long flags)
+{
+ unsigned long pte_hi = 0;
+#ifdef CONFIG_X86_PAE
+ pte_hi = new_val.pte_high;
+#endif
+ return _hypercall4(int, update_va_mapping, va,
+ new_val.pte_low, pte_hi, flags);
+}
+
+static inline int __must_check
+HYPERVISOR_event_channel_op(
+ int cmd, void *arg)
+{
+ int rc = _hypercall2(int, event_channel_op, cmd, arg);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOSYS)) {
+ struct evtchn_op op;
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, event_channel_op_compat, &op);
+ memcpy(arg, &op.u, sizeof(op.u));
+ }
+#endif
+
+ return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_acm_op(
+ int cmd, void *arg)
+{
+ return _hypercall2(int, acm_op, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_xen_version(
+ int cmd, void *arg)
+{
+ return _hypercall2(int, xen_version, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_console_io(
+ int cmd, unsigned int count, char *str)
+{
+ return _hypercall3(int, console_io, cmd, count, str);
+}
+
+static inline int __must_check
+HYPERVISOR_physdev_op(
+ int cmd, void *arg)
+{
+ int rc = _hypercall2(int, physdev_op, cmd, arg);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOSYS)) {
+ struct physdev_op op;
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, physdev_op_compat, &op);
+ memcpy(arg, &op.u, sizeof(op.u));
+ }
+#endif
+
+ return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_grant_table_op(
+ unsigned int cmd, void *uop, unsigned int count)
+{
+ return _hypercall3(int, grant_table_op, cmd, uop, count);
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping_otherdomain(
+ unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
+{
+ unsigned long pte_hi = 0;
+#ifdef CONFIG_X86_PAE
+ pte_hi = new_val.pte_high;
+#endif
+ return _hypercall5(int, update_va_mapping_otherdomain, va,
+ new_val.pte_low, pte_hi, flags, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_vm_assist(
+ unsigned int cmd, unsigned int type)
+{
+ return _hypercall2(int, vm_assist, cmd, type);
+}
+
+static inline int __must_check
+HYPERVISOR_vcpu_op(
+ int cmd, unsigned int vcpuid, void *extra_args)
+{
+ return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
+}
+
+static inline int __must_check
+HYPERVISOR_suspend(
+ unsigned long srec)
+{
+ struct sched_shutdown sched_shutdown = {
+ .reason = SHUTDOWN_suspend
+ };
+
+ int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
+ &sched_shutdown, srec);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (rc == -ENOSYS)
+ rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
+ SHUTDOWN_suspend, srec);
+#endif
+
+ return rc;
+}
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+static inline int
+HYPERVISOR_nmi_op(
+ unsigned long op, void *arg)
+{
+ return _hypercall2(int, nmi_op, op, arg);
+}
+#endif
+
+#ifndef CONFIG_XEN
+static inline unsigned long __must_check
+HYPERVISOR_hvm_op(
+ int op, void *arg)
+{
+ return _hypercall2(unsigned long, hvm_op, op, arg);
+}
+#endif
+
+static inline int __must_check
+HYPERVISOR_callback_op(
+ int cmd, const void *arg)
+{
+ return _hypercall2(int, callback_op, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_xenoprof_op(
+ int op, void *arg)
+{
+ return _hypercall2(int, xenoprof_op, op, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_kexec_op(
+ unsigned long op, void *args)
+{
+ return _hypercall2(int, kexec_op, op, args);
+}
+
+
+
+#endif /* __HYPERCALL_H__ */
--- /dev/null
+/******************************************************************************
+ * hypercall.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * 64-bit updates:
+ * Benjamin Liu <benjamin.liu@intel.com>
+ * Jun Nakajima <jun.nakajima@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __HYPERCALL_H__
+#define __HYPERCALL_H__
+
+#include <linux/string.h> /* memcpy() */
+#include <linux/stringify.h>
+
+#ifndef __HYPERVISOR_H__
+# error "please don't include this file directly"
+#endif
+
+#ifdef CONFIG_XEN
+#define HYPERCALL_STR(name) \
+ "call hypercall_page + ("__stringify(__HYPERVISOR_##name)" * 32)"
+#else
+#define HYPERCALL_STR(name) \
+ "mov $("__stringify(__HYPERVISOR_##name)" * 32),%%eax; "\
+ "add hypercall_stubs(%%rip),%%rax; " \
+ "call *%%rax"
+#endif
+
+#define _hypercall0(type, name) \
+({ \
+ type __res; \
+ asm volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res) \
+ : \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall1(type, name, a1) \
+({ \
+ type __res; \
+ long __ign1; \
+ asm volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1) \
+ : "1" ((long)(a1)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall2(type, name, a1, a2) \
+({ \
+ type __res; \
+ long __ign1, __ign2; \
+ asm volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2) \
+ : "1" ((long)(a1)), "2" ((long)(a2)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall3(type, name, a1, a2, a3) \
+({ \
+ type __res; \
+ long __ign1, __ign2, __ign3; \
+ asm volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
+ "=d" (__ign3) \
+ : "1" ((long)(a1)), "2" ((long)(a2)), \
+ "3" ((long)(a3)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall4(type, name, a1, a2, a3, a4) \
+({ \
+ type __res; \
+ long __ign1, __ign2, __ign3; \
+ register long __arg4 asm("r10") = (long)(a4); \
+ asm volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
+ "=d" (__ign3), "+r" (__arg4) \
+ : "1" ((long)(a1)), "2" ((long)(a2)), \
+ "3" ((long)(a3)) \
+ : "memory" ); \
+ __res; \
+})
+
+#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
+({ \
+ type __res; \
+ long __ign1, __ign2, __ign3; \
+ register long __arg4 asm("r10") = (long)(a4); \
+ register long __arg5 asm("r8") = (long)(a5); \
+ asm volatile ( \
+ HYPERCALL_STR(name) \
+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
+ "=d" (__ign3), "+r" (__arg4), "+r" (__arg5) \
+ : "1" ((long)(a1)), "2" ((long)(a2)), \
+ "3" ((long)(a3)) \
+ : "memory" ); \
+ __res; \
+})
+
+static inline int __must_check
+HYPERVISOR_set_trap_table(
+ const trap_info_t *table)
+{
+ return _hypercall1(int, set_trap_table, table);
+}
+
+static inline int __must_check
+HYPERVISOR_mmu_update(
+ mmu_update_t *req, unsigned int count, unsigned int *success_count,
+ domid_t domid)
+{
+ return _hypercall4(int, mmu_update, req, count, success_count, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_mmuext_op(
+ struct mmuext_op *op, unsigned int count, unsigned int *success_count,
+ domid_t domid)
+{
+ return _hypercall4(int, mmuext_op, op, count, success_count, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_set_gdt(
+ unsigned long *frame_list, unsigned int entries)
+{
+ return _hypercall2(int, set_gdt, frame_list, entries);
+}
+
+static inline int __must_check
+HYPERVISOR_stack_switch(
+ unsigned long ss, unsigned long esp)
+{
+ return _hypercall2(int, stack_switch, ss, esp);
+}
+
+static inline int __must_check
+HYPERVISOR_set_callbacks(
+ unsigned long event_address, unsigned long failsafe_address,
+ unsigned long syscall_address)
+{
+ return _hypercall3(int, set_callbacks,
+ event_address, failsafe_address, syscall_address);
+}
+
+static inline int
+HYPERVISOR_fpu_taskswitch(
+ int set)
+{
+ return _hypercall1(int, fpu_taskswitch, set);
+}
+
+static inline int __must_check
+HYPERVISOR_sched_op_compat(
+ int cmd, unsigned long arg)
+{
+ return _hypercall2(int, sched_op_compat, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_sched_op(
+ int cmd, void *arg)
+{
+ return _hypercall2(int, sched_op, cmd, arg);
+}
+
+static inline long __must_check
+HYPERVISOR_set_timer_op(
+ u64 timeout)
+{
+ return _hypercall1(long, set_timer_op, timeout);
+}
+
+static inline int __must_check
+HYPERVISOR_platform_op(
+ struct xen_platform_op *platform_op)
+{
+ platform_op->interface_version = XENPF_INTERFACE_VERSION;
+ return _hypercall1(int, platform_op, platform_op);
+}
+
+static inline int __must_check
+HYPERVISOR_set_debugreg(
+ unsigned int reg, unsigned long value)
+{
+ return _hypercall2(int, set_debugreg, reg, value);
+}
+
+static inline unsigned long __must_check
+HYPERVISOR_get_debugreg(
+ unsigned int reg)
+{
+ return _hypercall1(unsigned long, get_debugreg, reg);
+}
+
+static inline int __must_check
+HYPERVISOR_update_descriptor(
+ unsigned long ma, unsigned long word)
+{
+ return _hypercall2(int, update_descriptor, ma, word);
+}
+
+static inline int __must_check
+HYPERVISOR_memory_op(
+ unsigned int cmd, void *arg)
+{
+ return _hypercall2(int, memory_op, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_multicall(
+ multicall_entry_t *call_list, unsigned int nr_calls)
+{
+ return _hypercall2(int, multicall, call_list, nr_calls);
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping(
+ unsigned long va, pte_t new_val, unsigned long flags)
+{
+ return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
+}
+
+static inline int __must_check
+HYPERVISOR_event_channel_op(
+ int cmd, void *arg)
+{
+ int rc = _hypercall2(int, event_channel_op, cmd, arg);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOSYS)) {
+ struct evtchn_op op;
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, event_channel_op_compat, &op);
+ memcpy(arg, &op.u, sizeof(op.u));
+ }
+#endif
+
+ return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_acm_op(
+ int cmd, void *arg)
+{
+ return _hypercall2(int, acm_op, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_xen_version(
+ int cmd, void *arg)
+{
+ return _hypercall2(int, xen_version, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_console_io(
+ int cmd, unsigned int count, char *str)
+{
+ return _hypercall3(int, console_io, cmd, count, str);
+}
+
+static inline int __must_check
+HYPERVISOR_physdev_op(
+ int cmd, void *arg)
+{
+ int rc = _hypercall2(int, physdev_op, cmd, arg);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOSYS)) {
+ struct physdev_op op;
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, physdev_op_compat, &op);
+ memcpy(arg, &op.u, sizeof(op.u));
+ }
+#endif
+
+ return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_grant_table_op(
+ unsigned int cmd, void *uop, unsigned int count)
+{
+ return _hypercall3(int, grant_table_op, cmd, uop, count);
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping_otherdomain(
+ unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
+{
+ return _hypercall4(int, update_va_mapping_otherdomain, va,
+ new_val.pte, flags, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_vm_assist(
+ unsigned int cmd, unsigned int type)
+{
+ return _hypercall2(int, vm_assist, cmd, type);
+}
+
+static inline int __must_check
+HYPERVISOR_vcpu_op(
+ int cmd, unsigned int vcpuid, void *extra_args)
+{
+ return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
+}
+
+static inline int __must_check
+HYPERVISOR_set_segment_base(
+ int reg, unsigned long value)
+{
+ return _hypercall2(int, set_segment_base, reg, value);
+}
+
+static inline int __must_check
+HYPERVISOR_suspend(
+ unsigned long srec)
+{
+ struct sched_shutdown sched_shutdown = {
+ .reason = SHUTDOWN_suspend
+ };
+
+ int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
+ &sched_shutdown, srec);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (rc == -ENOSYS)
+ rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
+ SHUTDOWN_suspend, srec);
+#endif
+
+ return rc;
+}
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+static inline int
+HYPERVISOR_nmi_op(
+ unsigned long op, void *arg)
+{
+ return _hypercall2(int, nmi_op, op, arg);
+}
+#endif
+
+#ifndef CONFIG_XEN
+static inline unsigned long __must_check
+HYPERVISOR_hvm_op(
+ int op, void *arg)
+{
+ return _hypercall2(unsigned long, hvm_op, op, arg);
+}
+#endif
+
+static inline int __must_check
+HYPERVISOR_callback_op(
+ int cmd, const void *arg)
+{
+ return _hypercall2(int, callback_op, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_xenoprof_op(
+ int op, void *arg)
+{
+ return _hypercall2(int, xenoprof_op, op, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_kexec_op(
+ unsigned long op, void *args)
+{
+ return _hypercall2(int, kexec_op, op, args);
+}
+
+#endif /* __HYPERCALL_H__ */
--- /dev/null
+/******************************************************************************
+ * hypervisor.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __HYPERVISOR_H__
+#define __HYPERVISOR_H__
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/errno.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/platform.h>
+#include <xen/interface/event_channel.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/nmi.h>
+#include <asm/ptrace.h>
+#include <asm/page.h>
+
+extern shared_info_t *HYPERVISOR_shared_info;
+
+#define vcpu_info(cpu) (HYPERVISOR_shared_info->vcpu_info + (cpu))
+#ifdef CONFIG_SMP
+#define current_vcpu_info() vcpu_info(smp_processor_id())
+#else
+#define current_vcpu_info() vcpu_info(0)
+#endif
+
+#ifdef CONFIG_X86_32
+extern unsigned long hypervisor_virt_start;
+#endif
+
+/* arch/xen/i386/kernel/setup.c */
+extern start_info_t *xen_start_info;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
+#else
+#define is_initial_xendomain() 0
+#endif
+
+/* arch/xen/kernel/evtchn.c */
+/* Force a proper event-channel callback from Xen. */
+void force_evtchn_callback(void);
+
+/* arch/xen/kernel/process.c */
+void xen_cpu_idle (void);
+
+/* arch/xen/i386/kernel/hypervisor.c */
+void do_hypervisor_callback(struct pt_regs *regs);
+
+/* arch/xen/i386/mm/hypervisor.c */
+/*
+ * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should be already
+ * be MACHINE addresses.
+ */
+
+void xen_pt_switch(unsigned long ptr);
+void xen_new_user_pt(unsigned long ptr); /* x86_64 only */
+void xen_load_gs(unsigned int selector); /* x86_64 only */
+void xen_tlb_flush(void);
+void xen_invlpg(unsigned long ptr);
+
+void xen_l1_entry_update(pte_t *ptr, pte_t val);
+void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
+void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
+void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
+void xen_pgd_pin(unsigned long ptr);
+void xen_pgd_unpin(unsigned long ptr);
+
+void xen_set_ldt(const void *ptr, unsigned int ents);
+
+#ifdef CONFIG_SMP
+#include <linux/cpumask.h>
+void xen_tlb_flush_all(void);
+void xen_invlpg_all(unsigned long ptr);
+void xen_tlb_flush_mask(cpumask_t *mask);
+void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr);
+#endif
+
+/* Returns zero on success else negative errno. */
+int xen_create_contiguous_region(
+ unsigned long vstart, unsigned int order, unsigned int address_bits);
+void xen_destroy_contiguous_region(
+ unsigned long vstart, unsigned int order);
+
+struct page;
+
+int xen_limit_pages_to_max_mfn(
+ struct page *pages, unsigned int order, unsigned int address_bits);
+
+/* Turn jiffies into Xen system time. */
+u64 jiffies_to_st(unsigned long jiffies);
+
+#ifdef CONFIG_XEN_SCRUB_PAGES
+void scrub_pages(void *, unsigned int);
+#else
+#define scrub_pages(_p,_n) ((void)0)
+#endif
+
+#include <xen/hypercall.h>
+
+#if defined(CONFIG_X86_64)
+#define MULTI_UVMFLAGS_INDEX 2
+#define MULTI_UVMDOMID_INDEX 3
+#else
+#define MULTI_UVMFLAGS_INDEX 3
+#define MULTI_UVMDOMID_INDEX 4
+#endif
+
+#ifdef CONFIG_XEN
+#define is_running_on_xen() 1
+#else
+extern char *hypercall_stubs;
+#define is_running_on_xen() (!!hypercall_stubs)
+#endif
+
+static inline int
+HYPERVISOR_yield(
+ void)
+{
+ int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (rc == -ENOSYS)
+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
+#endif
+
+ return rc;
+}
+
+static inline int
+HYPERVISOR_block(
+ void)
+{
+ int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (rc == -ENOSYS)
+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
+#endif
+
+ return rc;
+}
+
+static inline void __noreturn
+HYPERVISOR_shutdown(
+ unsigned int reason)
+{
+ struct sched_shutdown sched_shutdown = {
+ .reason = reason
+ };
+
+ VOID(HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown));
+#if CONFIG_XEN_COMPAT <= 0x030002
+ VOID(HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason));
+#endif
+ /* Don't recurse needlessly. */
+ BUG_ON(reason != SHUTDOWN_crash);
+ for(;;);
+}
+
+static inline int __must_check
+HYPERVISOR_poll(
+ evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
+{
+ int rc;
+ struct sched_poll sched_poll = {
+ .nr_ports = nr_ports,
+ .timeout = jiffies_to_st(timeout)
+ };
+ set_xen_guest_handle(sched_poll.ports, ports);
+
+ rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (rc == -ENOSYS)
+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
+#endif
+
+ return rc;
+}
+
+#ifdef CONFIG_XEN
+
+static inline void
+MULTI_update_va_mapping(
+ multicall_entry_t *mcl, unsigned long va,
+ pte_t new_val, unsigned long flags)
+{
+ mcl->op = __HYPERVISOR_update_va_mapping;
+ mcl->args[0] = va;
+#if defined(CONFIG_X86_64)
+ mcl->args[1] = new_val.pte;
+#elif defined(CONFIG_X86_PAE)
+ mcl->args[1] = new_val.pte_low;
+ mcl->args[2] = new_val.pte_high;
+#else
+ mcl->args[1] = new_val.pte_low;
+ mcl->args[2] = 0;
+#endif
+ mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
+}
+
+static inline void
+MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd,
+ void *uop, unsigned int count)
+{
+ mcl->op = __HYPERVISOR_grant_table_op;
+ mcl->args[0] = cmd;
+ mcl->args[1] = (unsigned long)uop;
+ mcl->args[2] = count;
+}
+
+#else /* !defined(CONFIG_XEN) */
+
+/* Multicalls not supported for HVM guests. */
+#define MULTI_update_va_mapping(a,b,c,d) ((void)0)
+#define MULTI_grant_table_op(a,b,c,d) ((void)0)
+
+#endif
+
+#endif /* __HYPERVISOR_H__ */
--- /dev/null
+#ifndef _ASM_X86_IO_H
+#define _ASM_X86_IO_H
+
+#define ARCH_HAS_IOREMAP_WC
+
+#ifdef CONFIG_X86_32
+# include "io_32.h"
+#else
+# include "io_64.h"
+#endif
+
+extern void *xlate_dev_mem_ptr(unsigned long phys);
+extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
+
+extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t);
+extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t);
+
+extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
+ unsigned long prot_val);
+extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
+
+#endif /* _ASM_X86_IO_H */
--- /dev/null
+#ifndef _ASM_IO_H
+#define _ASM_IO_H
+
+#include <linux/string.h>
+#include <linux/compiler.h>
+
+/*
+ * This file contains the definitions for the x86 IO instructions
+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
+ * versions of the single-IO instructions (inb_p/inw_p/..).
+ *
+ * This file is not meant to be obfuscating: it's just complicated
+ * to (a) handle it all in a way that makes gcc able to optimize it
+ * as well as possible and (b) trying to avoid writing the same thing
+ * over and over again with slight variations and possibly making a
+ * mistake somewhere.
+ */
+
+/*
+ * Thanks to James van Artsdalen for a better timing-fix than
+ * the two short jumps: using outb's to a nonexistent port seems
+ * to guarantee better timings even on fast machines.
+ *
+ * On the other hand, I'd like to be sure of a non-existent port:
+ * I feel a bit unsafe about using 0x80 (should be safe, though)
+ *
+ * Linus
+ */
+
+ /*
+ * Bit simplified and optimized by Jan Hubicka
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
+ *
+ * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
+ * isa_read[wl] and isa_write[wl] fixed
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ */
+
+#define IO_SPACE_LIMIT 0xffff
+
+#define XQUAD_PORTIO_BASE 0xfe400000
+#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
+
+#ifdef __KERNEL__
+
+#include <asm-generic/iomap.h>
+
+#include <linux/vmalloc.h>
+#include <asm/fixmap.h>
+
+/*
+ * Convert a virtual cached pointer to an uncached pointer
+ */
+#define xlate_dev_kmem_ptr(p) p
+
+/**
+ * virt_to_phys - map virtual addresses to physical
+ * @address: address to remap
+ *
+ * The returned physical address is the physical (CPU) mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses directly mapped or allocated via kmalloc.
+ *
+ * This function does not give bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline unsigned long virt_to_phys(volatile void *address)
+{
+ return __pa(address);
+}
+
+/**
+ * phys_to_virt - map physical address to virtual
+ * @address: address to remap
+ *
+ * The returned virtual address is a current CPU mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses that have a kernel mapping
+ *
+ * This function does not handle bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline void *phys_to_virt(unsigned long address)
+{
+ return __va(address);
+}
+
+/*
+ * Change "struct page" to physical address.
+ */
+#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
+#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
+
+#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
+ (unsigned long) bio_offset((bio)))
+#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
+ (unsigned long) (bv)->bv_offset)
+
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+ (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
+ ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
+ bvec_to_pseudophys((vec2))))
+
+/**
+ * ioremap - map bus memory into CPU space
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * If the area you are trying to map is a PCI BAR you should have a
+ * look at pci_iomap().
+ */
+extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+
+/*
+ * The default ioremap() behavior is non-cached:
+ */
+static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
+{
+ return ioremap_nocache(offset, size);
+}
+
+extern void iounmap(volatile void __iomem *addr);
+
+/*
+ * early_ioremap() and early_iounmap() are for temporary early boot-time
+ * mappings, before the real ioremap() is functional.
+ * A boot-time mapping is currently limited to at most 16 pages.
+ */
+extern void early_ioremap_init(void);
+extern void early_ioremap_clear(void);
+extern void early_ioremap_reset(void);
+extern void *early_ioremap(unsigned long offset, unsigned long size);
+extern void early_iounmap(void *addr, unsigned long size);
+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
+
+/*
+ * ISA I/O bus memory addresses are 1:1 with the physical address.
+ */
+#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
+#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
+#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
+
+/*
+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
+ * are forbidden in portable PCI drivers.
+ *
+ * Allow them on x86 for legacy drivers, though.
+ */
+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
+#define bus_to_virt(_x) __va(machine_to_phys(_x))
+
+/*
+ * readX/writeX() are used to access memory mapped devices. On some
+ * architectures the memory mapped IO stuff needs to be accessed
+ * differently. On the x86 architecture, we just read/write the
+ * memory location directly.
+ */
+
+static inline unsigned char readb(const volatile void __iomem *addr)
+{
+ return *(volatile unsigned char __force *)addr;
+}
+
+static inline unsigned short readw(const volatile void __iomem *addr)
+{
+ return *(volatile unsigned short __force *)addr;
+}
+
+static inline unsigned int readl(const volatile void __iomem *addr)
+{
+ return *(volatile unsigned int __force *) addr;
+}
+
+#define readb_relaxed(addr) readb(addr)
+#define readw_relaxed(addr) readw(addr)
+#define readl_relaxed(addr) readl(addr)
+#define __raw_readb readb
+#define __raw_readw readw
+#define __raw_readl readl
+
+static inline void writeb(unsigned char b, volatile void __iomem *addr)
+{
+ *(volatile unsigned char __force *)addr = b;
+}
+
+static inline void writew(unsigned short b, volatile void __iomem *addr)
+{
+ *(volatile unsigned short __force *)addr = b;
+}
+
+static inline void writel(unsigned int b, volatile void __iomem *addr)
+{
+ *(volatile unsigned int __force *)addr = b;
+}
+#define __raw_writeb writeb
+#define __raw_writew writew
+#define __raw_writel writel
+
+#define mmiowb()
+
+static inline void
+memset_io(volatile void __iomem *addr, unsigned char val, int count)
+{
+ memset((void __force *)addr, val, count);
+}
+
+static inline void
+memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
+{
+ __memcpy(dst, (const void __force *)src, count);
+}
+
+static inline void
+memcpy_toio(volatile void __iomem *dst, const void *src, int count)
+{
+ __memcpy((void __force *)dst, src, count);
+}
+
+/*
+ * ISA space is 'always mapped' on a typical x86 system, no need to
+ * explicitly ioremap() it. The fact that the ISA IO space is mapped
+ * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
+ * are physical addresses. The following constant pointer can be
+ * used as the IO-area pointer (it can be iounmapped as well, so the
+ * analogy with PCI is quite large):
+ */
+#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
+
+/*
+ * Cache management
+ *
+ * This needed for two cases
+ * 1. Out of order aware processors
+ * 2. Accidentally out of order processors (PPro errata #51)
+ */
+
+#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
+
+static inline void flush_write_buffers(void)
+{
+ asm volatile("lock; addl $0,0(%%esp)": : :"memory");
+}
+
+#else
+
+#define flush_write_buffers() do { } while (0)
+
+#endif
+
+#endif /* __KERNEL__ */
+
+extern void xen_io_delay(void);
+#define native_io_delay xen_io_delay
+
+extern int io_delay_type;
+extern void io_delay_init(void);
+
+static inline void slow_down_io(void)
+{
+ native_io_delay();
+#ifdef REALLY_SLOW_IO
+ native_io_delay();
+ native_io_delay();
+ native_io_delay();
+#endif
+}
+
+#define __BUILDIO(bwl, bw, type) \
+static inline void out##bwl(unsigned type value, int port) \
+{ \
+ out##bwl##_local(value, port); \
+} \
+ \
+static inline unsigned type in##bwl(int port) \
+{ \
+ return in##bwl##_local(port); \
+}
+
+#define BUILDIO(bwl, bw, type) \
+static inline void out##bwl##_local(unsigned type value, int port) \
+{ \
+ asm volatile("out" #bwl " %" #bw "0, %w1" \
+ : : "a"(value), "Nd"(port)); \
+} \
+ \
+static inline unsigned type in##bwl##_local(int port) \
+{ \
+ unsigned type value; \
+ asm volatile("in" #bwl " %w1, %" #bw "0" \
+ : "=a"(value) : "Nd"(port)); \
+ return value; \
+} \
+ \
+static inline void out##bwl##_local_p(unsigned type value, int port) \
+{ \
+ out##bwl##_local(value, port); \
+ slow_down_io(); \
+} \
+ \
+static inline unsigned type in##bwl##_local_p(int port) \
+{ \
+ unsigned type value = in##bwl##_local(port); \
+ slow_down_io(); \
+ return value; \
+} \
+ \
+__BUILDIO(bwl, bw, type) \
+ \
+static inline void out##bwl##_p(unsigned type value, int port) \
+{ \
+ out##bwl(value, port); \
+ slow_down_io(); \
+} \
+ \
+static inline unsigned type in##bwl##_p(int port) \
+{ \
+ unsigned type value = in##bwl(port); \
+ slow_down_io(); \
+ return value; \
+} \
+ \
+static inline void outs##bwl(int port, const void *addr, unsigned long count) \
+{ \
+ asm volatile("rep; outs" #bwl \
+ : "+S"(addr), "+c"(count) : "d"(port)); \
+} \
+ \
+static inline void ins##bwl(int port, void *addr, unsigned long count) \
+{ \
+ asm volatile("rep; ins" #bwl \
+ : "+D"(addr), "+c"(count) : "d"(port)); \
+}
+
+BUILDIO(b, b, char)
+BUILDIO(w, w, short)
+BUILDIO(l, , int)
+
+/* We will be supplying our own /dev/mem implementation */
+#define ARCH_HAS_DEV_MEM
+
+#endif
--- /dev/null
+#ifndef _ASM_IO_H
+#define _ASM_IO_H
+
+#include <asm/fixmap.h>
+
+/*
+ * This file contains the definitions for the x86 IO instructions
+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
+ * versions of the single-IO instructions (inb_p/inw_p/..).
+ *
+ * This file is not meant to be obfuscating: it's just complicated
+ * to (a) handle it all in a way that makes gcc able to optimize it
+ * as well as possible and (b) trying to avoid writing the same thing
+ * over and over again with slight variations and possibly making a
+ * mistake somewhere.
+ */
+
+/*
+ * Thanks to James van Artsdalen for a better timing-fix than
+ * the two short jumps: using outb's to a nonexistent port seems
+ * to guarantee better timings even on fast machines.
+ *
+ * On the other hand, I'd like to be sure of a non-existent port:
+ * I feel a bit unsafe about using 0x80 (should be safe, though)
+ *
+ * Linus
+ */
+
+ /*
+ * Bit simplified and optimized by Jan Hubicka
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
+ *
+ * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
+ * isa_read[wl] and isa_write[wl] fixed
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ */
+
+extern void xen_io_delay(void);
+#define native_io_delay xen_io_delay
+
+extern int io_delay_type;
+extern void io_delay_init(void);
+
+#if defined(CONFIG_PARAVIRT)
+#include <asm/paravirt.h>
+#else
+
+static inline void slow_down_io(void)
+{
+ native_io_delay();
+#ifdef REALLY_SLOW_IO
+ native_io_delay();
+ native_io_delay();
+ native_io_delay();
+#endif
+}
+#endif
+
+/*
+ * Talk about misusing macros..
+ */
+#define __OUT1(s, x) \
+static inline void out##s(unsigned x value, unsigned short port) {
+
+#define __OUT2(s, s1, s2) \
+asm volatile ("out" #s " %" s1 "0,%" s2 "1"
+
+#ifndef REALLY_SLOW_IO
+#define REALLY_SLOW_IO
+#define UNSET_REALLY_SLOW_IO
+#endif
+
+#define __OUT(s, s1, x) \
+ __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
+ } \
+ __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
+ slow_down_io(); \
+}
+
+#define __IN1(s) \
+static inline RETURN_TYPE in##s(unsigned short port) \
+{ \
+ RETURN_TYPE _v;
+
+#define __IN2(s, s1, s2) \
+ asm volatile ("in" #s " %" s2 "1,%" s1 "0"
+
+#define __IN(s, s1, i...) \
+ __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
+ return _v; \
+ } \
+ __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
+ slow_down_io(); \
+ return _v; }
+
+#ifdef UNSET_REALLY_SLOW_IO
+#undef REALLY_SLOW_IO
+#endif
+
+#define __INS(s) \
+static inline void ins##s(unsigned short port, void *addr, \
+ unsigned long count) \
+{ \
+ asm volatile ("rep ; ins" #s \
+ : "=D" (addr), "=c" (count) \
+ : "d" (port), "0" (addr), "1" (count)); \
+}
+
+#define __OUTS(s) \
+static inline void outs##s(unsigned short port, const void *addr, \
+ unsigned long count) \
+{ \
+ asm volatile ("rep ; outs" #s \
+ : "=S" (addr), "=c" (count) \
+ : "d" (port), "0" (addr), "1" (count)); \
+}
+
+#define RETURN_TYPE unsigned char
+__IN(b, "")
+#undef RETURN_TYPE
+#define RETURN_TYPE unsigned short
+__IN(w, "")
+#undef RETURN_TYPE
+#define RETURN_TYPE unsigned int
+__IN(l, "")
+#undef RETURN_TYPE
+
+__OUT(b, "b", char)
+__OUT(w, "w", short)
+__OUT(l, , int)
+
+__INS(b)
+__INS(w)
+__INS(l)
+
+__OUTS(b)
+__OUTS(w)
+__OUTS(l)
+
+#define IO_SPACE_LIMIT 0xffff
+
+#if defined(__KERNEL__) && defined(__x86_64__)
+
+#include <linux/vmalloc.h>
+
+#ifndef __i386__
+/*
+ * Change virtual addresses to physical addresses and vv.
+ * These are pretty trivial
+ */
+static inline unsigned long virt_to_phys(volatile void *address)
+{
+ return __pa(address);
+}
+
+static inline void *phys_to_virt(unsigned long address)
+{
+ return __va(address);
+}
+
+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
+#define bus_to_virt(_x) __va(machine_to_phys(_x))
+#endif
+
+/*
+ * Change "struct page" to physical address.
+ */
+#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
+#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
+
+#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
+ (unsigned long) bio_offset((bio)))
+#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
+ (unsigned long) (bv)->bv_offset)
+
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+ (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
+ ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
+ bvec_to_pseudophys((vec2))))
+
+#include <asm-generic/iomap.h>
+
+extern void early_ioremap_init(void);
+extern void early_ioremap_clear(void);
+extern void early_ioremap_reset(void);
+extern void *early_ioremap(unsigned long addr, unsigned long size);
+extern void early_iounmap(void *addr, unsigned long size);
+
+/*
+ * This one maps high address device memory and turns off caching for that area.
+ * it's useful if some control registers are in such an area and write combining
+ * or read caching is not desirable:
+ */
+extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+
+/*
+ * The default ioremap() behavior is non-cached:
+ */
+static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
+{
+ return ioremap_nocache(offset, size);
+}
+
+extern void iounmap(volatile void __iomem *addr);
+
+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
+
+/*
+ * ISA I/O bus memory addresses are 1:1 with the physical address.
+ */
+
+#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
+#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
+#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
+
+/*
+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
+ * are forbidden in portable PCI drivers.
+ *
+ * Allow them on x86 for legacy drivers, though.
+ */
+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
+#define bus_to_virt(_x) __va(machine_to_phys(_x))
+
+/*
+ * readX/writeX() are used to access memory mapped devices. On some
+ * architectures the memory mapped IO stuff needs to be accessed
+ * differently. On the x86 architecture, we just read/write the
+ * memory location directly.
+ */
+
+static inline __u8 __readb(const volatile void __iomem *addr)
+{
+ return *(__force volatile __u8 *)addr;
+}
+
+static inline __u16 __readw(const volatile void __iomem *addr)
+{
+ return *(__force volatile __u16 *)addr;
+}
+
+static __always_inline __u32 __readl(const volatile void __iomem *addr)
+{
+ return *(__force volatile __u32 *)addr;
+}
+
+static inline __u64 __readq(const volatile void __iomem *addr)
+{
+ return *(__force volatile __u64 *)addr;
+}
+
+#define readb(x) __readb(x)
+#define readw(x) __readw(x)
+#define readl(x) __readl(x)
+#define readq(x) __readq(x)
+#define readb_relaxed(a) readb(a)
+#define readw_relaxed(a) readw(a)
+#define readl_relaxed(a) readl(a)
+#define readq_relaxed(a) readq(a)
+#define __raw_readb readb
+#define __raw_readw readw
+#define __raw_readl readl
+#define __raw_readq readq
+
+#define mmiowb()
+
+static inline void __writel(__u32 b, volatile void __iomem *addr)
+{
+ *(__force volatile __u32 *)addr = b;
+}
+
+static inline void __writeq(__u64 b, volatile void __iomem *addr)
+{
+ *(__force volatile __u64 *)addr = b;
+}
+
+static inline void __writeb(__u8 b, volatile void __iomem *addr)
+{
+ *(__force volatile __u8 *)addr = b;
+}
+
+static inline void __writew(__u16 b, volatile void __iomem *addr)
+{
+ *(__force volatile __u16 *)addr = b;
+}
+
+#define writeq(val, addr) __writeq((val), (addr))
+#define writel(val, addr) __writel((val), (addr))
+#define writew(val, addr) __writew((val), (addr))
+#define writeb(val, addr) __writeb((val), (addr))
+#define __raw_writeb writeb
+#define __raw_writew writew
+#define __raw_writel writel
+#define __raw_writeq writeq
+
+void __memcpy_fromio(void *, unsigned long, unsigned);
+void __memcpy_toio(unsigned long, const void *, unsigned);
+
+static inline void memcpy_fromio(void *to, const volatile void __iomem *from,
+ unsigned len)
+{
+ __memcpy_fromio(to, (unsigned long)from, len);
+}
+
+static inline void memcpy_toio(volatile void __iomem *to, const void *from,
+ unsigned len)
+{
+ __memcpy_toio((unsigned long)to, from, len);
+}
+
+void memset_io(volatile void __iomem *a, int b, size_t c);
+
+/*
+ * ISA space is 'always mapped' on a typical x86 system, no need to
+ * explicitly ioremap() it. The fact that the ISA IO space is mapped
+ * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
+ * are physical addresses. The following constant pointer can be
+ * used as the IO-area pointer (it can be iounmapped as well, so the
+ * analogy with PCI is quite large):
+ */
+#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
+
+#define flush_write_buffers()
+
+extern int iommu_bio_merge;
+#define BIO_VMERGE_BOUNDARY iommu_bio_merge
+
+/*
+ * Convert a virtual cached pointer to an uncached pointer
+ */
+#define xlate_dev_kmem_ptr(p) p
+
+#endif /* __KERNEL__ */
+
+#define ARCH_HAS_DEV_MEM
+
+#endif
--- /dev/null
+#ifdef CONFIG_X86_32
+# include "../../irq_32.h"
+#else
+# include "irq_64.h"
+#endif
--- /dev/null
+#ifndef _ASM_IRQ_H
+#define _ASM_IRQ_H
+
+/*
+ * linux/include/asm/irq.h
+ *
+ * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
+ *
+ * IRQ/IPI changes taken from work by Thomas Radke
+ * <tomsoft@informatik.tu-chemnitz.de>
+ */
+
+#include <linux/sched.h>
+/* include comes from machine specific directory */
+#include "irq_vectors.h"
+#include <asm/thread_info.h>
+
+static inline int irq_canonicalize(int irq)
+{
+ return ((irq == 2) ? 9 : irq);
+}
+
+#ifndef CONFIG_XEN
+#define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */
+#endif
+
+#define KDB_VECTOR 0xf9
+
+# define irq_ctx_init(cpu) do { } while (0)
+
+#ifdef CONFIG_HOTPLUG_CPU
+#include <linux/cpumask.h>
+extern void fixup_irqs(cpumask_t map);
+#endif
+
+#define __ARCH_HAS_DO_SOFTIRQ 1
+
+#endif /* _ASM_IRQ_H */
--- /dev/null
+#ifndef _X86_IRQFLAGS_H_
+#define _X86_IRQFLAGS_H_
+
+#include <asm/processor-flags.h>
+
+#ifndef __ASSEMBLY__
+/*
+ * The use of 'barrier' in the following reflects their use as local-lock
+ * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
+ * critical operations are executed. All critical operations must complete
+ * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
+ * includes these barriers, for example.
+ */
+
+#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
+
+#define xen_restore_fl(f) \
+do { \
+ vcpu_info_t *_vcpu; \
+ barrier(); \
+ _vcpu = current_vcpu_info(); \
+ if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
+ barrier(); /* unmask then check (avoid races) */\
+ if (unlikely(_vcpu->evtchn_upcall_pending)) \
+ force_evtchn_callback(); \
+ } \
+} while (0)
+
+#define xen_irq_disable() \
+do { \
+ current_vcpu_info()->evtchn_upcall_mask = 1; \
+ barrier(); \
+} while (0)
+
+#define xen_irq_enable() \
+do { \
+ vcpu_info_t *_vcpu; \
+ barrier(); \
+ _vcpu = current_vcpu_info(); \
+ _vcpu->evtchn_upcall_mask = 0; \
+ barrier(); /* unmask then check (avoid races) */ \
+ if (unlikely(_vcpu->evtchn_upcall_pending)) \
+ force_evtchn_callback(); \
+} while (0)
+
+void xen_safe_halt(void);
+
+void xen_halt(void);
+#endif
+
+#ifndef __ASSEMBLY__
+
+#define __raw_local_save_flags(void) xen_save_fl()
+
+#define raw_local_irq_restore(flags) xen_restore_fl(flags)
+
+#define raw_local_irq_disable() xen_irq_disable()
+
+#define raw_local_irq_enable() xen_irq_enable()
+
+/*
+ * Used in the idle loop; sti takes one instruction cycle
+ * to complete:
+ */
+static inline void raw_safe_halt(void)
+{
+ xen_safe_halt();
+}
+
+/*
+ * Used when interrupts are already enabled or to
+ * shutdown the processor:
+ */
+static inline void halt(void)
+{
+ xen_halt();
+}
+
+/*
+ * For spinlocks, etc:
+ */
+#define __raw_local_irq_save() \
+({ \
+ unsigned long flags = __raw_local_save_flags(); \
+ \
+ raw_local_irq_disable(); \
+ \
+ flags; \
+})
+#else
+
+/* Offsets into shared_info_t. */
+#define evtchn_upcall_pending /* 0 */
+#define evtchn_upcall_mask 1
+
+#define sizeof_vcpu_shift 6
+
+#ifdef CONFIG_X86_64
+# define __REG_si %rsi
+# define __CPU_num %gs:pda_cpunumber
+#else
+# define __REG_si %esi
+# define __CPU_num TI_cpu(%ebp)
+#endif
+
+#ifdef CONFIG_SMP
+#define GET_VCPU_INFO movl __CPU_num,%esi ; \
+ shl $sizeof_vcpu_shift,%esi ; \
+ add HYPERVISOR_shared_info,__REG_si
+#else
+#define GET_VCPU_INFO mov HYPERVISOR_shared_info,__REG_si
+#endif
+
+#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(__REG_si)
+#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(__REG_si)
+#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(__REG_si)
+#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
+ __DISABLE_INTERRUPTS
+#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
+ __ENABLE_INTERRUPTS
+
+#ifndef CONFIG_X86_64
+#define INTERRUPT_RETURN iret
+#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
+sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
+ __TEST_PENDING ; \
+ jnz 14f /* process more events if necessary... */ ; \
+ movl PT_ESI(%esp), %esi ; \
+ sysexit ; \
+14: __DISABLE_INTERRUPTS ; \
+ TRACE_IRQS_OFF ; \
+sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
+ push %esp ; \
+ call evtchn_do_upcall ; \
+ add $4,%esp ; \
+ jmp ret_from_intr
+#endif
+
+
+#endif /* __ASSEMBLY__ */
+
+#ifndef __ASSEMBLY__
+#define raw_local_save_flags(flags) \
+ do { (flags) = __raw_local_save_flags(); } while (0)
+
+#define raw_local_irq_save(flags) \
+ do { (flags) = __raw_local_irq_save(); } while (0)
+
+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+ return (flags != 0);
+}
+
+#define raw_irqs_disabled() \
+({ \
+ unsigned long flags = __raw_local_save_flags(); \
+ \
+ raw_irqs_disabled_flags(flags); \
+})
+
+/*
+ * makes the traced hardirq state match with the machine state
+ *
+ * should be a rarely used function, only in places where its
+ * otherwise impossible to know the irq state, like in traps.
+ */
+static inline void trace_hardirqs_fixup_flags(unsigned long flags)
+{
+ if (raw_irqs_disabled_flags(flags))
+ trace_hardirqs_off();
+ else
+ trace_hardirqs_on();
+}
+
+#define trace_hardirqs_fixup() \
+ trace_hardirqs_fixup_flags(__raw_local_save_flags())
+
+#else
+
+#ifdef CONFIG_X86_64
+/*
+ * Currently paravirt can't handle swapgs nicely when we
+ * don't have a stack we can rely on (such as a user space
+ * stack). So we either find a way around these or just fault
+ * and emulate if a guest tries to call swapgs directly.
+ *
+ * Either way, this is a good way to document that we don't
+ * have a reliable stack. x86_64 only.
+ */
+#define SWAPGS_UNSAFE_STACK swapgs
+#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
+#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
+#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
+#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
+ TRACE_IRQS_ON; \
+ ENABLE_INTERRUPTS(CLBR_NONE); \
+ SAVE_REST; \
+ LOCKDEP_SYS_EXIT; \
+ RESTORE_REST; \
+ __DISABLE_INTERRUPTS; \
+ TRACE_IRQS_OFF;
+
+#else
+#define ARCH_TRACE_IRQS_ON \
+ pushl %eax; \
+ pushl %ecx; \
+ pushl %edx; \
+ call trace_hardirqs_on; \
+ popl %edx; \
+ popl %ecx; \
+ popl %eax;
+
+#define ARCH_TRACE_IRQS_OFF \
+ pushl %eax; \
+ pushl %ecx; \
+ pushl %edx; \
+ call trace_hardirqs_off; \
+ popl %edx; \
+ popl %ecx; \
+ popl %eax;
+
+#define ARCH_LOCKDEP_SYS_EXIT \
+ pushl %eax; \
+ pushl %ecx; \
+ pushl %edx; \
+ call lockdep_sys_exit; \
+ popl %edx; \
+ popl %ecx; \
+ popl %eax;
+
+#define ARCH_LOCKDEP_SYS_EXIT_IRQ
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
+# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
+#else
+# define TRACE_IRQS_ON
+# define TRACE_IRQS_OFF
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
+# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
+# else
+# define LOCKDEP_SYS_EXIT
+# define LOCKDEP_SYS_EXIT_IRQ
+# endif
+
+#endif /* __ASSEMBLY__ */
+#endif
--- /dev/null
+#ifdef CONFIG_X86_32
+# include "maddr_32.h"
+#else
+# include "maddr_64.h"
+#endif
--- /dev/null
+#ifndef _I386_MADDR_H
+#define _I386_MADDR_H
+
+#include <asm/bug.h>
+#include <xen/features.h>
+#include <xen/interface/xen.h>
+
+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
+#define INVALID_P2M_ENTRY (~0UL)
+#define FOREIGN_FRAME_BIT (1UL<<31)
+#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
+
+/* Definitions for machine and pseudophysical addresses. */
+#ifdef CONFIG_X86_PAE
+typedef unsigned long long paddr_t;
+typedef unsigned long long maddr_t;
+#else
+typedef unsigned long paddr_t;
+typedef unsigned long maddr_t;
+#endif
+
+#ifdef CONFIG_XEN
+
+extern unsigned long *phys_to_machine_mapping;
+extern unsigned long max_mapnr;
+
+#undef machine_to_phys_mapping
+extern unsigned long *machine_to_phys_mapping;
+extern unsigned int machine_to_phys_order;
+
+static inline unsigned long pfn_to_mfn(unsigned long pfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return pfn;
+ BUG_ON(max_mapnr && pfn >= max_mapnr);
+ return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
+}
+
+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 1;
+ BUG_ON(max_mapnr && pfn >= max_mapnr);
+ return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+ unsigned long pfn;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return mfn;
+
+ if (unlikely((mfn >> machine_to_phys_order) != 0))
+ return max_mapnr;
+
+ /* The array access can fail (e.g., device space beyond end of RAM). */
+ asm (
+ "1: movl %1,%0\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl %2,%0\n"
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b,3b\n"
+ ".previous"
+ : "=r" (pfn)
+ : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
+
+ return pfn;
+}
+
+/*
+ * We detect special mappings in one of two ways:
+ * 1. If the MFN is an I/O page then Xen will set the m2p entry
+ * to be outside our maximum possible pseudophys range.
+ * 2. If the MFN belongs to a different domain then we will certainly
+ * not have MFN in our p2m table. Conversely, if the page is ours,
+ * then we'll have p2m(m2p(MFN))==MFN.
+ * If we detect a special mapping then it doesn't have a 'struct page'.
+ * We force !pfn_valid() by returning an out-of-range pointer.
+ *
+ * NB. These checks require that, for any MFN that is not in our reservation,
+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
+ * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN.
+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
+ *
+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
+ * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
+ * require. In all the cases we care about, the FOREIGN_FRAME bit is
+ * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
+ */
+static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+{
+ unsigned long pfn = mfn_to_pfn(mfn);
+ if ((pfn < max_mapnr)
+ && !xen_feature(XENFEAT_auto_translated_physmap)
+ && (phys_to_machine_mapping[pfn] != mfn))
+ return max_mapnr; /* force !pfn_valid() */
+ return pfn;
+}
+
+static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ BUG_ON(max_mapnr && pfn >= max_mapnr);
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+ return;
+ }
+ phys_to_machine_mapping[pfn] = mfn;
+}
+
+static inline maddr_t phys_to_machine(paddr_t phys)
+{
+ maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
+ machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
+ return machine;
+}
+
+static inline paddr_t machine_to_phys(maddr_t machine)
+{
+ paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
+ phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
+ return phys;
+}
+
+#ifdef CONFIG_X86_PAE
+static inline paddr_t pte_phys_to_machine(paddr_t phys)
+{
+ /*
+ * In PAE mode, the NX bit needs to be dealt with in the value
+ * passed to pfn_to_mfn(). On x86_64, we need to mask it off,
+ * but for i386 the conversion to ulong for the argument will
+ * clip it off.
+ */
+ maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
+ machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK);
+ return machine;
+}
+
+static inline paddr_t pte_machine_to_phys(maddr_t machine)
+{
+ /*
+ * In PAE mode, the NX bit needs to be dealt with in the value
+ * passed to mfn_to_pfn(). On x86_64, we need to mask it off,
+ * but for i386 the conversion to ulong for the argument will
+ * clip it off.
+ */
+ paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
+ phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
+ return phys;
+}
+#else
+#define pte_phys_to_machine phys_to_machine
+#define pte_machine_to_phys machine_to_phys
+#endif
+
+#else /* !CONFIG_XEN */
+
+#define pfn_to_mfn(pfn) (pfn)
+#define mfn_to_pfn(mfn) (mfn)
+#define mfn_to_local_pfn(mfn) (mfn)
+#define set_phys_to_machine(pfn, mfn) ((void)0)
+#define phys_to_machine_mapping_valid(pfn) (1)
+#define phys_to_machine(phys) ((maddr_t)(phys))
+#define machine_to_phys(mach) ((paddr_t)(mach))
+#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot)
+#define __pte_ma(x) __pte(x)
+
+#endif /* !CONFIG_XEN */
+
+/* VIRT <-> MACHINE conversion */
+#define virt_to_machine(v) (phys_to_machine(__pa(v)))
+#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
+#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
+
+#endif /* _I386_MADDR_H */
--- /dev/null
+#ifndef _X86_64_MADDR_H
+#define _X86_64_MADDR_H
+
+#include <asm/bug.h>
+#include <xen/features.h>
+#include <xen/interface/xen.h>
+
+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
+#define INVALID_P2M_ENTRY (~0UL)
+#define FOREIGN_FRAME_BIT (1UL<<63)
+#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
+
+/* Definitions for machine and pseudophysical addresses. */
+typedef unsigned long paddr_t;
+typedef unsigned long maddr_t;
+
+#ifdef CONFIG_XEN
+
+extern unsigned long *phys_to_machine_mapping;
+extern unsigned long max_mapnr;
+
+#undef machine_to_phys_mapping
+extern unsigned long *machine_to_phys_mapping;
+extern unsigned int machine_to_phys_order;
+
+static inline unsigned long pfn_to_mfn(unsigned long pfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return pfn;
+ BUG_ON(max_mapnr && pfn >= max_mapnr);
+ return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
+}
+
+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 1;
+ BUG_ON(max_mapnr && pfn >= max_mapnr);
+ return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+ unsigned long pfn;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return mfn;
+
+ if (unlikely((mfn >> machine_to_phys_order) != 0))
+ return max_mapnr;
+
+ /* The array access can fail (e.g., device space beyond end of RAM). */
+ asm (
+ "1: movq %1,%0\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movq %2,%0\n"
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 8\n"
+ " .quad 1b,3b\n"
+ ".previous"
+ : "=r" (pfn)
+ : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
+
+ return pfn;
+}
+
+/*
+ * We detect special mappings in one of two ways:
+ * 1. If the MFN is an I/O page then Xen will set the m2p entry
+ * to be outside our maximum possible pseudophys range.
+ * 2. If the MFN belongs to a different domain then we will certainly
+ * not have MFN in our p2m table. Conversely, if the page is ours,
+ * then we'll have p2m(m2p(MFN))==MFN.
+ * If we detect a special mapping then it doesn't have a 'struct page'.
+ * We force !pfn_valid() by returning an out-of-range pointer.
+ *
+ * NB. These checks require that, for any MFN that is not in our reservation,
+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
+ * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN.
+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
+ *
+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
+ * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
+ * require. In all the cases we care about, the FOREIGN_FRAME bit is
+ * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
+ */
+static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+{
+ unsigned long pfn = mfn_to_pfn(mfn);
+ if ((pfn < max_mapnr)
+ && !xen_feature(XENFEAT_auto_translated_physmap)
+ && (phys_to_machine_mapping[pfn] != mfn))
+ return max_mapnr; /* force !pfn_valid() */
+ return pfn;
+}
+
+static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ BUG_ON(max_mapnr && pfn >= max_mapnr);
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+ return;
+ }
+ phys_to_machine_mapping[pfn] = mfn;
+}
+
+static inline maddr_t phys_to_machine(paddr_t phys)
+{
+ maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
+ machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
+ return machine;
+}
+
+static inline paddr_t machine_to_phys(maddr_t machine)
+{
+ paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
+ phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
+ return phys;
+}
+
+static inline paddr_t pte_phys_to_machine(paddr_t phys)
+{
+ maddr_t machine;
+ machine = pfn_to_mfn((phys & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
+ machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK);
+ return machine;
+}
+
+static inline paddr_t pte_machine_to_phys(maddr_t machine)
+{
+ paddr_t phys;
+ phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
+ phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
+ return phys;
+}
+
+#else /* !CONFIG_XEN */
+
+#define pfn_to_mfn(pfn) (pfn)
+#define mfn_to_pfn(mfn) (mfn)
+#define mfn_to_local_pfn(mfn) (mfn)
+#define set_phys_to_machine(pfn, mfn) ((void)0)
+#define phys_to_machine_mapping_valid(pfn) (1)
+#define phys_to_machine(phys) ((maddr_t)(phys))
+#define machine_to_phys(mach) ((paddr_t)(mach))
+#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot)
+#define __pte_ma(x) __pte(x)
+
+#endif /* !CONFIG_XEN */
+
+/* VIRT <-> MACHINE conversion */
+#define virt_to_machine(v) (phys_to_machine(__pa(v)))
+#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
+#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
+
+#endif /* _X86_64_MADDR_H */
+
--- /dev/null
+#ifdef CONFIG_X86_32
+# include "mmu_context_32.h"
+#else
+# include "mmu_context_64.h"
+#endif
--- /dev/null
+#ifndef __I386_SCHED_H
+#define __I386_SCHED_H
+
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+void arch_exit_mmap(struct mm_struct *mm);
+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+
+void mm_pin(struct mm_struct *mm);
+void mm_unpin(struct mm_struct *mm);
+void mm_pin_all(void);
+
+static inline void xen_activate_mm(struct mm_struct *prev,
+ struct mm_struct *next)
+{
+ if (!PagePinned(virt_to_page(next->pgd)))
+ mm_pin(next);
+}
+
+/*
+ * Used for LDT copy/destruction.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context(struct mm_struct *mm);
+
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+#if 0 /* XEN: no lazy tlb */
+ unsigned cpu = smp_processor_id();
+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
+ per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
+#endif
+}
+
+#define prepare_arch_switch(next) __prepare_arch_switch()
+
+static inline void __prepare_arch_switch(void)
+{
+ /*
+ * Save away %gs. No need to save %fs, as it was saved on the
+ * stack on entry. No need to save %es and %ds, as those are
+ * always kernel segments while inside the kernel.
+ */
+ asm volatile ( "mov %%gs,%0"
+ : "=m" (current->thread.gs));
+ asm volatile ( "movl %0,%%gs"
+ : : "r" (0) );
+}
+
+static inline void switch_mm(struct mm_struct *prev,
+ struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ int cpu = smp_processor_id();
+ struct mmuext_op _op[2], *op = _op;
+
+ if (likely(prev != next)) {
+ BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
+ !PagePinned(virt_to_page(next->pgd)));
+
+ /* stop flush ipis for the previous mm */
+ cpu_clear(cpu, prev->cpu_vm_mask);
+#if 0 /* XEN: no lazy tlb */
+ per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
+ per_cpu(cpu_tlbstate, cpu).active_mm = next;
+#endif
+ cpu_set(cpu, next->cpu_vm_mask);
+
+ /* Re-load page tables: load_cr3(next->pgd) */
+ op->cmd = MMUEXT_NEW_BASEPTR;
+ op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
+ op++;
+
+ /*
+ * load the LDT, if the LDT is different:
+ */
+ if (unlikely(prev->context.ldt != next->context.ldt)) {
+ /* load_LDT_nolock(&next->context, cpu) */
+ op->cmd = MMUEXT_SET_LDT;
+ op->arg1.linear_addr = (unsigned long)next->context.ldt;
+ op->arg2.nr_ents = next->context.size;
+ op++;
+ }
+
+ BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
+ }
+#if 0 /* XEN: no lazy tlb */
+ else {
+ per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
+ BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
+
+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+ /* We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload %cr3.
+ */
+ load_cr3(next->pgd);
+ load_LDT_nolock(&next->context);
+ }
+ }
+#endif
+}
+
+#define deactivate_mm(tsk, mm) \
+ asm("movl %0,%%gs": :"r" (0));
+
+#define activate_mm(prev, next) \
+do { \
+ xen_activate_mm(prev, next); \
+ switch_mm((prev), (next), NULL); \
+} while (0)
+
+#endif
--- /dev/null
+#ifndef __X86_64_MMU_CONTEXT_H
+#define __X86_64_MMU_CONTEXT_H
+
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/pgalloc.h>
+#include <asm/page.h>
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+void arch_exit_mmap(struct mm_struct *mm);
+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+
+/*
+ * possibly do the LDT unload here?
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context(struct mm_struct *mm);
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+ if (read_pda(mmu_state) == TLBSTATE_OK)
+ write_pda(mmu_state, TLBSTATE_LAZY);
+#endif
+}
+
+#define prepare_arch_switch(next) __prepare_arch_switch()
+
+static inline void __prepare_arch_switch(void)
+{
+ /*
+ * Save away %es, %ds, %fs and %gs. Must happen before reload
+ * of cr3/ldt (i.e., not in __switch_to).
+ */
+ __asm__ __volatile__ (
+ "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
+ : "=m" (current->thread.es),
+ "=m" (current->thread.ds),
+ "=m" (current->thread.fsindex),
+ "=m" (current->thread.gsindex) );
+
+ if (current->thread.ds)
+ __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
+
+ if (current->thread.es)
+ __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
+
+ if (current->thread.fsindex) {
+ __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
+ current->thread.fs = 0;
+ }
+
+ if (current->thread.gsindex) {
+ load_gs_index(0);
+ current->thread.gs = 0;
+ }
+}
+
+extern void mm_pin(struct mm_struct *mm);
+extern void mm_unpin(struct mm_struct *mm);
+void mm_pin_all(void);
+
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned cpu = smp_processor_id();
+ struct mmuext_op _op[3], *op = _op;
+
+ if (likely(prev != next)) {
+ BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
+ !PagePinned(virt_to_page(next->pgd)));
+
+ /* stop flush ipis for the previous mm */
+ cpu_clear(cpu, prev->cpu_vm_mask);
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+ write_pda(mmu_state, TLBSTATE_OK);
+ write_pda(active_mm, next);
+#endif
+ cpu_set(cpu, next->cpu_vm_mask);
+
+ /* load_cr3(next->pgd) */
+ op->cmd = MMUEXT_NEW_BASEPTR;
+ op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
+ op++;
+
+ /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
+ op->cmd = MMUEXT_NEW_USER_BASEPTR;
+ op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT);
+ op++;
+
+ if (unlikely(next->context.ldt != prev->context.ldt)) {
+ /* load_LDT_nolock(&next->context) */
+ op->cmd = MMUEXT_SET_LDT;
+ op->arg1.linear_addr = (unsigned long)next->context.ldt;
+ op->arg2.nr_ents = next->context.size;
+ op++;
+ }
+
+ BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
+ }
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+ else {
+ write_pda(mmu_state, TLBSTATE_OK);
+ if (read_pda(active_mm) != next)
+ BUG();
+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+ /* We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload CR3
+ * to make sure to use no freed page tables.
+ */
+ load_cr3(next->pgd);
+ xen_new_user_pt(__pa(__user_pgd(next->pgd)));
+ load_LDT_nolock(&next->context);
+ }
+ }
+#endif
+}
+
+#define deactivate_mm(tsk, mm) \
+do { \
+ load_gs_index(0); \
+ asm volatile("movl %0,%%fs"::"r"(0)); \
+} while (0)
+
+static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
+{
+ if (!PagePinned(virt_to_page(next->pgd)))
+ mm_pin(next);
+ switch_mm(prev, next, NULL);
+}
+
+#endif
--- /dev/null
+#ifndef _ASM_X86_PAGE_H
+#define _ASM_X86_PAGE_H
+
+#include <linux/const.h>
+
+/* PAGE_SHIFT determines the page size */
+#define PAGE_SHIFT 12
+#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
+#define PAGE_MASK (~(PAGE_SIZE-1))
+
+#ifdef __KERNEL__
+
+/*
+ * Need to repeat this here in order to not include pgtable.h (which in turn
+ * depends on definitions made here), but to be able to use the symbolics
+ * below. The preprocessor will warn if the two definitions aren't identical.
+ */
+#define _PAGE_BIT_PRESENT 0
+#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
+#define _PAGE_BIT_IO 9
+#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
+
+#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
+#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
+
+/* Cast PAGE_MASK to a signed type so that it is sign-extended if
+ virtual addresses are 32-bits but physical addresses are larger
+ (ie, 32-bit PAE). */
+#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
+
+/* PTE_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
+#define PTE_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
+
+#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
+#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
+
+#define HPAGE_SHIFT PMD_SHIFT
+#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
+#define HPAGE_MASK (~(HPAGE_SIZE - 1))
+#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
+
+/* to align the pointer to the (next) page boundary */
+#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#endif
+
+#ifdef CONFIG_X86_64
+#include <asm/page_64.h>
+#else
+#include <asm/page_32.h>
+#endif /* CONFIG_X86_64 */
+
+#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
+
+#define VM_DATA_DEFAULT_FLAGS \
+ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+
+#ifndef __ASSEMBLY__
+
+extern int page_is_ram(unsigned long pagenr);
+extern int devmem_is_allowed(unsigned long pagenr);
+
+extern unsigned long max_pfn_mapped;
+
+struct page;
+
+static inline void clear_user_page(void *page, unsigned long vaddr,
+ struct page *pg)
+{
+ clear_page(page);
+}
+
+static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
+ struct page *topage)
+{
+ copy_page(to, from);
+}
+
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+ alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
+typedef struct { pgprotval_t pgprot; } pgprot_t;
+
+#define pgprot_val(x) ((x).pgprot)
+#define __pgprot(x) ((pgprot_t) { (x) } )
+
+#include <asm/maddr.h>
+
+typedef struct { pgdval_t pgd; } pgd_t;
+
+#define __pgd_ma(x) ((pgd_t) { (x) } )
+static inline pgd_t xen_make_pgd(pgdval_t val)
+{
+ if (val & _PAGE_PRESENT)
+ val = pte_phys_to_machine(val);
+ return (pgd_t) { val };
+}
+
+#define __pgd_val(x) ((x).pgd)
+static inline pgdval_t xen_pgd_val(pgd_t pgd)
+{
+ pgdval_t ret = __pgd_val(pgd);
+#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002
+ if (ret)
+ ret = machine_to_phys(ret) | _PAGE_PRESENT;
+#else
+ if (ret & _PAGE_PRESENT)
+ ret = pte_machine_to_phys(ret);
+#endif
+ return ret;
+}
+
+#if PAGETABLE_LEVELS >= 3
+#if PAGETABLE_LEVELS == 4
+typedef struct { pudval_t pud; } pud_t;
+
+#define __pud_ma(x) ((pud_t) { (x) } )
+static inline pud_t xen_make_pud(pudval_t val)
+{
+ if (val & _PAGE_PRESENT)
+ val = pte_phys_to_machine(val);
+ return (pud_t) { val };
+}
+
+#define __pud_val(x) ((x).pud)
+static inline pudval_t xen_pud_val(pud_t pud)
+{
+ pudval_t ret = __pud_val(pud);
+ if (ret & _PAGE_PRESENT)
+ ret = pte_machine_to_phys(ret);
+ return ret;
+}
+#else /* PAGETABLE_LEVELS == 3 */
+#include <asm-generic/pgtable-nopud.h>
+
+#define __pud_val(x) __pgd_val((x).pgd)
+static inline pudval_t xen_pud_val(pud_t pud)
+{
+ return xen_pgd_val(pud.pgd);
+}
+#endif /* PAGETABLE_LEVELS == 4 */
+
+typedef struct { pmdval_t pmd; } pmd_t;
+
+#define __pmd_ma(x) ((pmd_t) { (x) } )
+static inline pmd_t xen_make_pmd(pmdval_t val)
+{
+ if (val & _PAGE_PRESENT)
+ val = pte_phys_to_machine(val);
+ return (pmd_t) { val };
+}
+
+#define __pmd_val(x) ((x).pmd)
+static inline pmdval_t xen_pmd_val(pmd_t pmd)
+{
+ pmdval_t ret = __pmd_val(pmd);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (ret)
+ ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
+#else
+ if (ret & _PAGE_PRESENT)
+ ret = pte_machine_to_phys(ret);
+#endif
+ return ret;
+}
+#else /* PAGETABLE_LEVELS == 2 */
+#include <asm-generic/pgtable-nopmd.h>
+
+#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } )
+#define __pmd_val(x) __pgd_val((x).pud.pgd)
+static inline pmdval_t xen_pmd_val(pmd_t pmd)
+{
+ return xen_pgd_val(pmd.pud.pgd);
+}
+#endif /* PAGETABLE_LEVELS >= 3 */
+
+#define __pte_ma(x) ((pte_t) { .pte = (x) } )
+static inline pte_t xen_make_pte(unsigned long long val)
+{
+ if ((val & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
+ val = pte_phys_to_machine(val);
+ return (pte_t) { .pte = val };
+}
+
+#define __pte_val(x) ((x).pte)
+static inline pteval_t xen_pte_val(pte_t pte)
+{
+ pteval_t ret = __pte_val(pte);
+ if ((pte.pte_low & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
+ ret = pte_machine_to_phys(ret);
+ return ret;
+}
+
+#define pgd_val(x) xen_pgd_val(x)
+#define __pgd(x) xen_make_pgd(x)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define pud_val(x) xen_pud_val(x)
+#define __pud(x) xen_make_pud(x)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pmd_val(x) xen_pmd_val(x)
+#define __pmd(x) xen_make_pmd(x)
+#endif
+
+#define pte_val(x) xen_pte_val(x)
+#define __pte(x) xen_make_pte(x)
+
+#define __pa(x) __phys_addr((unsigned long)(x))
+/* __pa_symbol should be used for C visible symbols.
+ This seems to be the official gcc blessed way to do such arithmetic. */
+#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
+
+#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
+
+#define __boot_va(x) __va(x)
+#define __boot_pa(x) __pa(x)
+
+#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
+#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
+#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+
+#endif /* __ASSEMBLY__ */
+
+#include <asm-generic/memory_model.h>
+#include <asm-generic/page.h>
+
+#define __HAVE_ARCH_GATE_AREA 1
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_PAGE_H */
--- /dev/null
+#ifndef _X86_64_PAGE_H
+#define _X86_64_PAGE_H
+
+#define PAGETABLE_LEVELS 4
+
+#define THREAD_ORDER 1
+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+#define CURRENT_MASK (~(THREAD_SIZE - 1))
+
+#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
+
+#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
+#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
+
+#define IRQSTACK_ORDER 2
+#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
+
+#define STACKFAULT_STACK 1
+#define DOUBLEFAULT_STACK 2
+#define NMI_STACK 3
+#define DEBUG_STACK 4
+#define MCE_STACK 5
+#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
+
+#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
+
+#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
+
+#define __PHYSICAL_START CONFIG_PHYSICAL_START
+#define __KERNEL_ALIGN 0x200000
+
+/*
+ * Make sure kernel is aligned to 2MB address. Catching it at compile
+ * time is better. Change your config file and compile the kernel
+ * for a 2MB aligned address (CONFIG_PHYSICAL_START)
+ */
+#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
+#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
+#endif
+
+#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
+#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+#undef LOAD_OFFSET
+#define LOAD_OFFSET 0
+#endif
+
+/* See Documentation/x86_64/mm.txt for a description of the memory map. */
+#define __PHYSICAL_MASK_SHIFT 46
+#define __VIRTUAL_MASK_SHIFT 48
+
+/*
+ * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
+ * arch/x86/kernel/head_64.S), and it is mapped here:
+ */
+#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
+#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
+
+#ifndef __ASSEMBLY__
+void clear_page(void *page);
+void copy_page(void *to, void *from);
+
+extern unsigned long end_pfn;
+
+static inline unsigned long __phys_addr(unsigned long x)
+{
+ return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : __PAGE_OFFSET);
+}
+
+#define __phys_reloc_hide(x) (x)
+
+/*
+ * These are used to make use of C type-checking..
+ */
+typedef unsigned long pteval_t;
+typedef unsigned long pmdval_t;
+typedef unsigned long pudval_t;
+typedef unsigned long pgdval_t;
+typedef unsigned long pgprotval_t;
+typedef unsigned long phys_addr_t;
+
+typedef struct page *pgtable_t;
+
+typedef union { pteval_t pte; unsigned int pte_low; } pte_t;
+
+#define vmemmap ((struct page *)VMEMMAP_START)
+
+extern unsigned long init_memory_mapping(unsigned long start,
+ unsigned long end);
+
+#endif /* !__ASSEMBLY__ */
+
+#ifdef CONFIG_FLATMEM
+#define pfn_valid(pfn) ((pfn) < max_mapnr)
+#endif
+
+
+#endif /* _X86_64_PAGE_H */
--- /dev/null
+#ifndef __x86_PCI_H
+#define __x86_PCI_H
+
+#include <linux/mm.h> /* for struct page */
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <asm/scatterlist.h>
+#include <asm/io.h>
+
+#ifdef __KERNEL__
+
+struct pci_sysdata {
+ int domain; /* PCI domain */
+ int node; /* NUMA node */
+#ifdef CONFIG_X86_64
+ void *iommu; /* IOMMU private data */
+#endif
+#ifdef CONFIG_XEN_PCIDEV_FRONTEND
+ struct pcifront_device *pdev;
+#endif
+};
+
+/* scan a bus after allocating a pci_sysdata for it */
+extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
+ int node);
+extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
+
+static inline int pci_domain_nr(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+ return sd->domain;
+}
+
+static inline int pci_proc_domain(struct pci_bus *bus)
+{
+ return pci_domain_nr(bus);
+}
+
+extern void pci_iommu_alloc(void);
+
+/* Can be used to override the logic in pci_scan_bus for skipping
+ already-configured bus numbers - to be used for buggy BIOSes
+ or architectures with incomplete PCI setup by the loader */
+
+#ifdef CONFIG_PCI
+extern unsigned int pcibios_assign_all_busses(void);
+#else
+#define pcibios_assign_all_busses() 0
+#endif
+
+#include <asm/hypervisor.h>
+#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain())
+
+extern unsigned long pci_mem_start;
+#define PCIBIOS_MIN_IO 0x1000
+#define PCIBIOS_MIN_MEM (pci_mem_start)
+
+#define PCIBIOS_MIN_CARDBUS_IO 0x4000
+
+void pcibios_config_init(void);
+struct pci_bus *pcibios_scan_root(int bus);
+
+void pcibios_set_master(struct pci_dev *dev);
+void pcibios_penalize_isa_irq(int irq, int active);
+struct irq_routing_table *pcibios_get_irq_routing_table(void);
+int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
+
+
+#define HAVE_PCI_MMAP
+extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+ enum pci_mmap_state mmap_state,
+ int write_combine);
+
+
+#ifdef CONFIG_PCI
+extern void early_quirks(void);
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+ enum pci_dma_burst_strategy *strat,
+ unsigned long *strategy_parameter)
+{
+ *strat = PCI_DMA_BURST_INFINITY;
+ *strategy_parameter = ~0UL;
+}
+#else
+static inline void early_quirks(void) { }
+#endif
+
+#endif /* __KERNEL__ */
+
+#ifdef CONFIG_X86_32
+# include "pci_32.h"
+#else
+# include "pci_64.h"
+#endif
+
+/* implement the pci_ DMA API in terms of the generic device dma_ one */
+#include <asm-generic/pci-dma-compat.h>
+
+/* generic pci stuff */
+#include <asm-generic/pci.h>
+
+#ifdef CONFIG_NUMA
+/* Returns the node based on pci bus */
+static inline int __pcibus_to_node(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+
+ return sd->node;
+}
+
+static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return node_to_cpumask(__pcibus_to_node(bus));
+}
+#endif
+
+#endif
--- /dev/null
+#ifndef __i386_PCI_H
+#define __i386_PCI_H
+
+
+#ifdef __KERNEL__
+
+/* Dynamic DMA mapping stuff.
+ * i386 has everything mapped statically.
+ */
+
+struct pci_dev;
+
+#ifdef CONFIG_SWIOTLB
+
+
+/* On Xen we use SWIOTLB instead of blk-specific bounce buffers. */
+#define PCI_DMA_BUS_IS_PHYS (0)
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
+ dma_addr_t ADDR_NAME;
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
+ __u32 LEN_NAME;
+#define pci_unmap_addr(PTR, ADDR_NAME) \
+ ((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
+ (((PTR)->ADDR_NAME) = (VAL))
+#define pci_unmap_len(PTR, LEN_NAME) \
+ ((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
+ (((PTR)->LEN_NAME) = (VAL))
+
+#else
+
+/* The PCI address space does equal the physical memory
+ * address space. The networking and block device layers use
+ * this boolean for bounce buffer decisions.
+ */
+#define PCI_DMA_BUS_IS_PHYS (1)
+
+/* pci_unmap_{page,single} is a nop so... */
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
+#define pci_unmap_addr(PTR, ADDR_NAME) (0)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
+#define pci_unmap_len(PTR, LEN_NAME) (0)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
+
+#endif
+
+
+#endif /* __KERNEL__ */
+
+
+#endif /* __i386_PCI_H */
--- /dev/null
+#ifndef __x8664_PCI_H
+#define __x8664_PCI_H
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_CALGARY_IOMMU
+static inline void *pci_iommu(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+ return sd->iommu;
+}
+
+static inline void set_pci_iommu(struct pci_bus *bus, void *val)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+ sd->iommu = val;
+}
+#endif /* CONFIG_CALGARY_IOMMU */
+
+extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
+ int reg, int len, u32 *value);
+extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
+ int reg, int len, u32 value);
+
+extern void dma32_reserve_bootmem(void);
+
+/* The PCI address space does equal the physical memory
+ * address space. The networking and block device layers use
+ * this boolean for bounce buffer decisions
+ *
+ * On AMD64 it mostly equals, but we set it to zero if a hardware
+ * IOMMU (gart) of sotware IOMMU (swiotlb) is available.
+ */
+#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
+
+#if defined(CONFIG_GART_IOMMU) || defined(CONFIG_CALGARY_IOMMU)
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
+ dma_addr_t ADDR_NAME;
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
+ __u32 LEN_NAME;
+#define pci_unmap_addr(PTR, ADDR_NAME) \
+ ((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
+ (((PTR)->ADDR_NAME) = (VAL))
+#define pci_unmap_len(PTR, LEN_NAME) \
+ ((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
+ (((PTR)->LEN_NAME) = (VAL))
+
+#elif defined(CONFIG_SWIOTLB)
+
+#define pci_dac_dma_supported(pci_dev, mask) 1
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
+ dma_addr_t ADDR_NAME;
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
+ __u32 LEN_NAME;
+#define pci_unmap_addr(PTR, ADDR_NAME) \
+ ((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
+ (((PTR)->ADDR_NAME) = (VAL))
+#define pci_unmap_len(PTR, LEN_NAME) \
+ ((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
+ (((PTR)->LEN_NAME) = (VAL))
+
+#else
+/* No IOMMU */
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
+#define pci_unmap_addr(PTR, ADDR_NAME) (0)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
+#define pci_unmap_len(PTR, LEN_NAME) (0)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
+
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* __x8664_PCI_H */
--- /dev/null
+#ifndef _ASM_X86_PGALLOC_H
+#define _ASM_X86_PGALLOC_H
+
+#include <linux/threads.h>
+#include <linux/mm.h> /* for struct page */
+#include <linux/pagemap.h>
+
+#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
+
+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
+ unsigned long start, unsigned long count) {}
+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_release_pte(unsigned long pfn) {}
+static inline void paravirt_release_pmd(unsigned long pfn) {}
+static inline void paravirt_release_pud(unsigned long pfn) {}
+
+#ifdef CONFIG_X86_64
+void early_make_page_readonly(void *va, unsigned int feature);
+pmd_t *early_get_pmd(unsigned long va);
+#define make_lowmem_page_readonly make_page_readonly
+#define make_lowmem_page_writable make_page_writable
+#endif
+
+/*
+ * Allocate and free page tables.
+ */
+extern void pgd_test_and_unpin(pgd_t *);
+extern pgd_t *pgd_alloc(struct mm_struct *);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+
+extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
+extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
+
+/* Should really implement gc for free page table pages. This could be
+ done with a reference count in struct page. */
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+ BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
+ make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
+ free_page((unsigned long)pte);
+}
+
+extern void __pte_free(pgtable_t);
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
+{
+ __pte_free(pte);
+}
+
+extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+
+static inline void pmd_populate_kernel(struct mm_struct *mm,
+ pmd_t *pmd, pte_t *pte)
+{
+ paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+ struct page *pte)
+{
+ unsigned long pfn = page_to_pfn(pte);
+
+ paravirt_alloc_pte(mm, pfn);
+ if (PagePinned(virt_to_page(mm->pgd))) {
+ if (!PageHighMem(pte))
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
+#ifndef CONFIG_X86_64
+ else if (!TestSetPagePinned(pte))
+ kmap_flush_unused();
+#endif
+ set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
+ } else
+ *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+
+#if PAGETABLE_LEVELS > 2
+extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
+extern void __pmd_free(pgtable_t);
+
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+ __pmd_free(virt_to_page(pmd));
+}
+
+extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+
+#ifdef CONFIG_X86_PAE
+extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
+#else /* !CONFIG_X86_PAE */
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+ if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)pmd,
+ pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
+ PAGE_KERNEL_RO), 0));
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+ } else
+ *pud = __pud(_PAGE_TABLE | __pa(pmd));
+}
+#endif /* CONFIG_X86_PAE */
+
+#if PAGETABLE_LEVELS > 3
+#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
+
+/*
+ * We need to use the batch mode here, but pgd_pupulate() won't be
+ * be called frequently.
+ */
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
+ paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
+ if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)pud,
+ pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
+ PAGE_KERNEL_RO), 0));
+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+ set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
+ } else {
+ *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
+ *__user_pgd(pgd) = *(pgd);
+ }
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ return (pud_t *)pmd_alloc_one(mm, addr);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+ BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
+ __pmd_free(virt_to_page(pud));
+}
+
+extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
+#endif /* PAGETABLE_LEVELS > 3 */
+#endif /* PAGETABLE_LEVELS > 2 */
+
+#endif /* _ASM_X86_PGALLOC_H */
--- /dev/null
+#ifndef _I386_PGTABLE_3LEVEL_DEFS_H
+#define _I386_PGTABLE_3LEVEL_DEFS_H
+
+#define SHARED_KERNEL_PMD 0
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 30
+#define PTRS_PER_PGD 4
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */
--- /dev/null
+#ifndef _I386_PGTABLE_3LEVEL_H
+#define _I386_PGTABLE_3LEVEL_H
+
+/*
+ * Intel Physical Address Extension (PAE) Mode - three-level page
+ * tables on PPro+ CPUs.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#define pte_ERROR(e) \
+ printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", \
+ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
+#define pmd_ERROR(e) \
+ printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
+ __FILE__, __LINE__, &(e), __pmd_val(e), \
+ (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
+#define pgd_ERROR(e) \
+ printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
+ __FILE__, __LINE__, &(e), __pgd_val(e), \
+ (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
+
+static inline int pud_none(pud_t pud)
+{
+ return __pud_val(pud) == 0;
+
+}
+static inline int pud_bad(pud_t pud)
+{
+ return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
+}
+
+static inline int pud_present(pud_t pud)
+{
+ return __pud_val(pud) & _PAGE_PRESENT;
+}
+
+/* Rules for using set_pte: the pte being assigned *must* be
+ * either not present or in a state where the hardware will
+ * not attempt to update the pte. In places where this is
+ * not possible, use pte_get_and_clear to obtain the old pte
+ * value and then use set_pte to update it. -ben
+ */
+
+static inline void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ set_64bit((unsigned long long *)(ptep), __pte_val(pte));
+}
+
+static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ xen_l2_entry_update(pmdp, pmd);
+}
+
+static inline void xen_set_pud(pud_t *pudp, pud_t pud)
+{
+ xen_l3_entry_update(pudp, pud);
+}
+
+/*
+ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
+ * entry, so clear the bottom half first and enforce ordering with a compiler
+ * barrier.
+ */
+static inline void __xen_pte_clear(pte_t *ptep)
+{
+ ptep->pte_low = 0;
+ smp_wmb();
+ ptep->pte_high = 0;
+}
+
+static inline void xen_pmd_clear(pmd_t *pmd)
+{
+ xen_l2_entry_update(pmd, __pmd(0));
+}
+
+static inline void pud_clear(pud_t *pudp)
+{
+ pgdval_t pgd;
+
+ set_pud(pudp, __pud(0));
+
+ /*
+ * According to Intel App note "TLBs, Paging-Structure Caches,
+ * and Their Invalidation", April 2007, document 317080-001,
+ * section 8.1: in PAE mode we explicitly have to flush the
+ * TLB via cr3 if the top-level pgd is changed...
+ *
+ * Make sure the pud entry we're updating is within the
+ * current pgd to avoid unnecessary TLB flushes.
+ */
+ pgd = read_cr3();
+ if (__pa(pudp) >= pgd && __pa(pudp) <
+ (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
+ xen_tlb_flush();
+}
+
+#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
+
+#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
+
+
+/* Find an entry in the second-level page table.. */
+#define pmd_offset(pud, address) ((pmd_t *)pud_page(*(pud)) + \
+ pmd_index(address))
+
+#ifdef CONFIG_SMP
+static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
+{
+ uint64_t val = __pte_val(res);
+ if (__cmpxchg64(ptep, val, 0) != val) {
+ /* xchg acts as a barrier before the setting of the high bits */
+ res.pte_low = xchg(&ptep->pte_low, 0);
+ res.pte_high = ptep->pte_high;
+ ptep->pte_high = 0;
+ }
+ return res;
+}
+#else
+#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
+#endif
+
+#define __HAVE_ARCH_PTE_SAME
+static inline int pte_same(pte_t a, pte_t b)
+{
+ return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
+}
+
+#define pte_page(x) pfn_to_page(pte_pfn(x))
+
+static inline int pte_none(pte_t pte)
+{
+ return !(pte.pte_low | pte.pte_high);
+}
+
+#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
+ ((_pte).pte_high << (32-PAGE_SHIFT)))
+#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
+ __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
+#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IO ? max_mapnr : \
+ (_pte).pte_low & _PAGE_PRESENT ? \
+ mfn_to_local_pfn(__pte_mfn(_pte)) : \
+ __pte_mfn(_pte))
+
+/*
+ * Bits 0, 6 and 7 are taken in the low part of the pte,
+ * put the 32 bits of offset into the high part.
+ */
+#define pte_to_pgoff(pte) ((pte).pte_high)
+#define pgoff_to_pte(off) \
+ ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
+#define PTE_FILE_MAX_BITS 32
+
+/* Encode and de-code a swap entry */
+#define __swp_type(x) (((x).val) & 0x1f)
+#define __swp_offset(x) ((x).val >> 5)
+#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
+#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
+#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
+
+#endif /* _I386_PGTABLE_3LEVEL_H */
--- /dev/null
+#ifndef _ASM_X86_PGTABLE_H
+#define _ASM_X86_PGTABLE_H
+
+#define FIRST_USER_ADDRESS 0
+
+#define _PAGE_BIT_PRESENT 0 /* is present */
+#define _PAGE_BIT_RW 1 /* writeable */
+#define _PAGE_BIT_USER 2 /* userspace addressable */
+#define _PAGE_BIT_PWT 3 /* page write through */
+#define _PAGE_BIT_PCD 4 /* page cache disabled */
+#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
+#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
+#define _PAGE_BIT_FILE 6
+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
+#define _PAGE_BIT_PAT 7 /* on 4KB pages */
+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
+#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
+ * has no associated page struct. */
+#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
+#define _PAGE_BIT_UNUSED3 11
+#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+
+/*
+ * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
+ * sign-extended value on 32-bit with all 1's in the upper word,
+ * which preserves the upper pte values on 64-bit ptes:
+ */
+#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
+#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
+#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
+#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
+#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
+#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
+#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
+#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
+#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
+#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
+#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
+#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
+#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
+#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
+#else
+#define _PAGE_NX 0
+#endif
+
+/* If _PAGE_PRESENT is clear, we use these: */
+#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping,
+ * saved PTE; unset:swap */
+#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
+ pte_present gives true */
+
+#ifndef __ASSEMBLY__
+#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
+extern unsigned int __kernel_page_user;
+#else
+#define __kernel_page_user 0
+#endif
+#endif
+
+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
+ _PAGE_DIRTY | __kernel_page_user)
+
+/* Set of bits not changed in pte_modify */
+#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
+ _PAGE_ACCESSED | _PAGE_DIRTY)
+
+/*
+ * PAT settings are part of the hypervisor interface, which sets the
+ * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
+ */
+#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT)
+#define _PAGE_CACHE_WB (0)
+#define _PAGE_CACHE_WT (_PAGE_PWT)
+#define _PAGE_CACHE_WC (_PAGE_PAT)
+#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT)
+#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
+#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
+
+#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+
+#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
+ _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
+#define PAGE_COPY PAGE_COPY_NOEXEC
+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
+
+#ifdef CONFIG_X86_32
+#define _PAGE_KERNEL_EXEC \
+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
+#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
+
+#ifndef __ASSEMBLY__
+extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
+#endif /* __ASSEMBLY__ */
+#else
+#define __PAGE_KERNEL_EXEC \
+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
+#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
+#endif
+
+#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
+#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
+#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
+#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
+#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
+#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+
+/*
+ * We don't support GLOBAL page in xenolinux64
+ */
+#define MAKE_GLOBAL(x) __pgprot((x))
+
+#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
+#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
+#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
+#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
+#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
+#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
+#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
+#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
+#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
+#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+
+/* xwr */
+#define __P000 PAGE_NONE
+#define __P001 PAGE_READONLY
+#define __P010 PAGE_COPY
+#define __P011 PAGE_COPY
+#define __P100 PAGE_READONLY_EXEC
+#define __P101 PAGE_READONLY_EXEC
+#define __P110 PAGE_COPY_EXEC
+#define __P111 PAGE_COPY_EXEC
+
+#define __S000 PAGE_NONE
+#define __S001 PAGE_READONLY
+#define __S010 PAGE_SHARED
+#define __S011 PAGE_SHARED
+#define __S100 PAGE_READONLY_EXEC
+#define __S101 PAGE_READONLY_EXEC
+#define __S110 PAGE_SHARED_EXEC
+#define __S111 PAGE_SHARED_EXEC
+
+#ifndef __ASSEMBLY__
+
+/*
+ * ZERO_PAGE is a global shared page that is always zero: used
+ * for zero-mapped memory areas etc..
+ */
+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+
+extern spinlock_t pgd_lock;
+extern struct list_head pgd_list;
+
+/*
+ * The following only work if pte_present() is true.
+ * Undefined behaviour if not..
+ */
+static inline int pte_dirty(pte_t pte)
+{
+ return __pte_val(pte) & _PAGE_DIRTY;
+}
+
+static inline int pte_young(pte_t pte)
+{
+ return __pte_val(pte) & _PAGE_ACCESSED;
+}
+
+static inline int pte_write(pte_t pte)
+{
+ return __pte_val(pte) & _PAGE_RW;
+}
+
+static inline int pte_file(pte_t pte)
+{
+ return __pte_val(pte) & _PAGE_FILE;
+}
+
+static inline int pte_huge(pte_t pte)
+{
+ return __pte_val(pte) & _PAGE_PSE;
+}
+
+static inline int pte_global(pte_t pte)
+{
+ return __pte_val(pte) & _PAGE_GLOBAL;
+}
+
+static inline int pte_exec(pte_t pte)
+{
+ return !(__pte_val(pte) & _PAGE_NX);
+}
+
+static inline int pte_special(pte_t pte)
+{
+ return 0;
+}
+
+static inline int pmd_large(pmd_t pte)
+{
+ return (__pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ (_PAGE_PSE | _PAGE_PRESENT);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
+}
+
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
+}
+
+static inline pte_t pte_mkexec(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) | _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) | _PAGE_RW);
+}
+
+static inline pte_t pte_mkhuge(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) | _PAGE_PSE);
+}
+
+static inline pte_t pte_clrhuge(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
+}
+
+static inline pte_t pte_mkglobal(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) | _PAGE_GLOBAL);
+}
+
+static inline pte_t pte_clrglobal(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_GLOBAL);
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+ return pte;
+}
+
+extern pteval_t __supported_pte_mask;
+
+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+ return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
+ pgprot_val(pgprot)) & __supported_pte_mask);
+}
+
+static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
+{
+ return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
+ pgprot_val(pgprot)) & __supported_pte_mask);
+}
+
+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
+{
+ return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
+ pgprot_val(pgprot)) & __supported_pte_mask);
+}
+
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+ pteval_t val = pte_val(pte);
+
+ val &= _PAGE_CHG_MASK;
+ val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
+
+ return __pte(val);
+}
+
+/* mprotect needs to preserve PAT bits when updating vm_page_prot */
+#define pgprot_modify pgprot_modify
+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+{
+ pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
+ pgprotval_t addbits = pgprot_val(newprot);
+ return __pgprot(preservebits | addbits);
+}
+
+#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
+
+#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
+
+#ifndef __ASSEMBLY__
+#define __HAVE_PHYS_MEM_ACCESS_PROT
+struct file;
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot);
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t *vma_prot);
+#endif
+
+#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
+#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
+
+#define set_pte_atomic(ptep, pte) \
+ xen_set_pte_atomic(ptep, pte)
+
+#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd)
+#define pgd_clear(pgd) xen_pgd_clear(pgd)
+#endif
+
+#ifndef set_pud
+# define set_pud(pudp, pud) xen_set_pud(pudp, pud)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pud_clear(pud) xen_pud_clear(pud)
+#endif
+
+#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
+#define pmd_clear(pmd) xen_pmd_clear(pmd)
+
+#define pte_update(mm, addr, ptep) do { } while (0)
+#define pte_update_defer(mm, addr, ptep) do { } while (0)
+
+#endif /* __ASSEMBLY__ */
+
+#ifdef CONFIG_X86_32
+# include "pgtable_32.h"
+#else
+# include "pgtable_64.h"
+#endif
+
+#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
+#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
+
+#ifndef __ASSEMBLY__
+
+enum {
+ PG_LEVEL_NONE,
+ PG_LEVEL_4K,
+ PG_LEVEL_2M,
+ PG_LEVEL_1G,
+};
+
+/*
+ * Helper function that returns the kernel pagetable entry controlling
+ * the virtual address 'address'. NULL means no pagetable entry present.
+ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
+ * as a pte too.
+ */
+extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+
+/* local pte updates need not use xchg for locking */
+static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
+{
+ xen_set_pte(ptep, __pte(0));
+ return res;
+}
+
+static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep , pte_t pte)
+{
+ if ((mm != current->mm && mm != &init_mm) ||
+ HYPERVISOR_update_va_mapping(addr, pte, 0))
+ xen_set_pte(ptep, pte);
+}
+
+static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ if ((mm != current->mm && mm != &init_mm)
+ || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
+ __xen_pte_clear(ptep);
+}
+
+#ifndef CONFIG_PARAVIRT
+/*
+ * Rules for using pte_update - it must be called after any PTE update which
+ * has not been done using the set_pte / clear_pte interfaces. It is used by
+ * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
+ * updates should either be sets, clears, or set_pte_atomic for P->P
+ * transitions, which means this hook should only be called for user PTEs.
+ * This hook implies a P->P protection or access change has taken place, which
+ * requires a subsequent TLB flush. The notification can optionally be delayed
+ * until the TLB flush event by using the pte_update_defer form of the
+ * interface, but care must be taken to assure that the flush happens while
+ * still holding the same page table lock so that the shadow and primary pages
+ * do not become out of sync on SMP.
+ */
+#define pte_update(mm, addr, ptep) do { } while (0)
+#define pte_update_defer(mm, addr, ptep) do { } while (0)
+#endif
+
+/*
+ * We only update the dirty/accessed state if we set
+ * the dirty bit by hand in the kernel, since the hardware
+ * will do the accessed bit for us, and we don't want to
+ * race with other CPU's that might be updating the dirty
+ * bit at the same time.
+ */
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty);
+
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+extern int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep);
+
+#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
+#define ptep_clear_flush(vma, addr, ptep) \
+({ \
+ pte_t *__ptep = (ptep); \
+ pte_t __res = *__ptep; \
+ if (!pte_none(__res) && \
+ ((vma)->vm_mm != current->mm || \
+ HYPERVISOR_update_va_mapping(addr, __pte(0), \
+ (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
+ UVMF_INVLPG|UVMF_MULTI))) { \
+ __xen_pte_clear(__ptep); \
+ flush_tlb_page(vma, addr); \
+ } \
+ __res; \
+})
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ pte_t pte = *ptep;
+ if (!pte_none(pte)
+ && (mm != &init_mm
+ || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
+ pte = xen_ptep_get_and_clear(ptep, pte);
+ pte_update(mm, addr, ptep);
+ }
+ return pte;
+}
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+#define ptep_get_and_clear_full(mm, addr, ptep, full) \
+ ((full) ? ({ \
+ pte_t *__ptep = (ptep); \
+ pte_t __res = *__ptep; \
+ if (!PagePinned(virt_to_page((mm)->pgd))) \
+ __xen_pte_clear(__ptep); \
+ else if (!pte_none(__res)) \
+ xen_l1_entry_update(__ptep, __pte(0)); \
+ __res; \
+ }) : \
+ ptep_get_and_clear(mm, addr, ptep))
+
+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t pte = *ptep;
+ if (pte_write(pte))
+ set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
+}
+
+/*
+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
+ *
+ * dst - pointer to pgd range anwhere on a pgd page
+ * src - ""
+ * count - the number of pgds to copy.
+ *
+ * dst and src can be on the same page, but the range must not overlap,
+ * and must not cross a page boundary.
+ */
+static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+{
+ memcpy(dst, src, count * sizeof(pgd_t));
+}
+
+#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
+ xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
+
+#define arbitrary_virt_to_machine(va) \
+({ \
+ unsigned int __lvl; \
+ pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \
+ BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\
+ (((maddr_t)pte_mfn(*__ptep) << PAGE_SHIFT) \
+ | ((unsigned long)(va) & (PAGE_SIZE - 1))); \
+})
+
+#include <asm-generic/pgtable.h>
+
+#include <xen/features.h>
+void make_page_readonly(void *va, unsigned int feature);
+void make_page_writable(void *va, unsigned int feature);
+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
+void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
+
+struct vm_area_struct;
+
+int direct_remap_pfn_range(struct vm_area_struct *vma,
+ unsigned long address,
+ unsigned long mfn,
+ unsigned long size,
+ pgprot_t prot,
+ domid_t domid);
+int direct_kernel_remap_pfn_range(unsigned long address,
+ unsigned long mfn,
+ unsigned long size,
+ pgprot_t prot,
+ domid_t domid);
+int create_lookup_pte_addr(struct mm_struct *mm,
+ unsigned long address,
+ uint64_t *ptep);
+int touch_pte_range(struct mm_struct *mm,
+ unsigned long address,
+ unsigned long size);
+
+int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end, pgprot_t newprot,
+ int dirty_accountable);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_H */
--- /dev/null
+#ifndef _I386_PGTABLE_H
+#define _I386_PGTABLE_H
+
+#include <asm/hypervisor.h>
+
+/*
+ * The Linux memory management assumes a three-level page table setup. On
+ * the i386, we use that, but "fold" the mid level into the top-level page
+ * table, so that we physically have the same two-level page table as the
+ * i386 mmu expects.
+ *
+ * This file contains the functions and defines necessary to modify and use
+ * the i386 page table tree.
+ */
+#ifndef __ASSEMBLY__
+#include <asm/processor.h>
+#include <asm/fixmap.h>
+#include <linux/threads.h>
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+
+struct vm_area_struct;
+
+extern pgd_t *swapper_pg_dir;
+
+static inline void pgtable_cache_init(void) { }
+static inline void check_pgt_cache(void) { }
+void paging_init(void);
+
+
+/*
+ * The Linux x86 paging architecture is 'compile-time dual-mode', it
+ * implements both the traditional 2-level x86 page tables and the
+ * newer 3-level PAE-mode page tables.
+ */
+#ifdef CONFIG_X86_PAE
+# include <asm/pgtable-3level-defs.h>
+# define PMD_SIZE (1UL << PMD_SHIFT)
+# define PMD_MASK (~(PMD_SIZE - 1))
+#else
+# include <asm/pgtable-2level-defs.h>
+#endif
+
+#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+
+/* Just any arbitrary offset to the start of the vmalloc VM area: the
+ * current 8MB value just means that there will be a 8MB "hole" after the
+ * physical memory until the kernel virtual memory starts. That means that
+ * any out-of-bounds memory accesses will hopefully be caught.
+ * The vmalloc() routines leaves a hole of 4kB between each vmalloced
+ * area for the same reason. ;)
+ */
+#define VMALLOC_OFFSET (8 * 1024 * 1024)
+#define VMALLOC_START (((unsigned long)high_memory + 2 * VMALLOC_OFFSET - 1) \
+ & ~(VMALLOC_OFFSET - 1))
+#ifdef CONFIG_X86_PAE
+#define LAST_PKMAP 512
+#else
+#define LAST_PKMAP 1024
+#endif
+
+#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
+ & PMD_MASK)
+
+#ifdef CONFIG_HIGHMEM
+# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
+#else
+# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
+#endif
+
+/*
+ * Define this if things work differently on an i386 and an i486:
+ * it will (on an i486) warn about kernel memory accesses that are
+ * done without a 'access_ok(VERIFY_WRITE,..)'
+ */
+#undef TEST_ACCESS_OK
+
+/* The boot page tables (all created as a single array) */
+extern unsigned long pg0[];
+
+#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
+
+/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
+#define pmd_none(x) (!(unsigned long)__pmd_val(x))
+#if CONFIG_XEN_COMPAT <= 0x030002
+/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
+ can temporarily clear it. */
+#define pmd_present(x) (__pmd_val(x))
+#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
+#else
+#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
+#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
+#endif
+
+
+#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
+
+#ifdef CONFIG_X86_PAE
+# include <asm/pgtable-3level.h>
+#else
+# include <asm/pgtable-2level.h>
+#endif
+
+/*
+ * Macro to mark a page protection value as "uncacheable".
+ * On processors which do not support it, this is a no-op.
+ */
+#define pgprot_noncached(prot) \
+ ((boot_cpu_data.x86 > 3) \
+ ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
+ : (prot))
+
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ */
+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
+
+/*
+ * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
+ *
+ * this macro returns the index of the entry in the pgd page which would
+ * control the given virtual address
+ */
+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
+#define pgd_index_k(addr) pgd_index((addr))
+
+/*
+ * pgd_offset() returns a (pgd_t *)
+ * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
+ */
+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+
+/*
+ * a shortcut which implies the use of the kernel's pgd, instead
+ * of a process's
+ */
+#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
+
+static inline int pud_large(pud_t pud) { return 0; }
+
+/*
+ * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
+ *
+ * this macro returns the index of the entry in the pmd page which would
+ * control the given virtual address
+ */
+#define pmd_index(address) \
+ (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+
+/*
+ * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
+ *
+ * this macro returns the index of the entry in the pte page which would
+ * control the given virtual address
+ */
+#define pte_index(address) \
+ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+#define pte_offset_kernel(dir, address) \
+ ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
+
+#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
+
+#define pmd_page_vaddr(pmd) \
+ ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
+
+#if defined(CONFIG_HIGHPTE)
+#define pte_offset_map(dir, address) \
+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
+ pte_index((address)))
+#define pte_offset_map_nested(dir, address) \
+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
+ pte_index((address)))
+#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
+#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
+#else
+#define pte_offset_map(dir, address) \
+ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
+#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
+#define pte_unmap(pte) do { } while (0)
+#define pte_unmap_nested(pte) do { } while (0)
+#endif
+
+/* Clear a kernel PTE and flush it from the TLB */
+#define kpte_clear_flush(ptep, vaddr) \
+do { \
+ if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \
+ BUG(); \
+} while (0)
+
+/*
+ * The i386 doesn't have any external MMU info: the kernel page
+ * tables contain all the necessary information.
+ */
+#define update_mmu_cache(vma, address, pte) do { } while (0)
+
+void make_lowmem_page_readonly(void *va, unsigned int feature);
+void make_lowmem_page_writable(void *va, unsigned int feature);
+
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * kern_addr_valid() is (1) for FLATMEM and (0) for
+ * SPARSEMEM and DISCONTIGMEM
+ */
+#ifdef CONFIG_FLATMEM
+#define kern_addr_valid(addr) (1)
+#else
+#define kern_addr_valid(kaddr) (0)
+#endif
+
+#define io_remap_pfn_range(vma, from, pfn, size, prot) \
+ direct_remap_pfn_range(vma, from, pfn, size, prot, DOMID_IO)
+
+#endif /* _I386_PGTABLE_H */
--- /dev/null
+#ifndef _X86_64_PGTABLE_H
+#define _X86_64_PGTABLE_H
+
+#include <linux/const.h>
+#ifndef __ASSEMBLY__
+
+/*
+ * This file contains the functions and defines necessary to modify and use
+ * the x86-64 page table tree.
+ */
+#include <asm/processor.h>
+#include <linux/bitops.h>
+#include <linux/threads.h>
+#include <linux/sched.h>
+#include <asm/pda.h>
+#ifdef CONFIG_XEN
+#include <asm/hypervisor.h>
+
+extern pud_t level3_user_pgt[512];
+
+extern void xen_init_pt(void);
+#endif
+
+extern pud_t level3_kernel_pgt[512];
+extern pud_t level3_ident_pgt[512];
+extern pmd_t level2_kernel_pgt[512];
+extern pgd_t init_level4_pgt[];
+
+#define swapper_pg_dir init_level4_pgt
+
+extern void paging_init(void);
+
+#endif /* !__ASSEMBLY__ */
+
+#define SHARED_KERNEL_PMD 0
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 39
+#define PTRS_PER_PGD 512
+
+/*
+ * 3rd level page
+ */
+#define PUD_SHIFT 30
+#define PTRS_PER_PUD 512
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+#ifndef __ASSEMBLY__
+
+#define pte_ERROR(e) \
+ printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \
+ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
+#define pmd_ERROR(e) \
+ printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \
+ __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e))
+#define pud_ERROR(e) \
+ printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", \
+ __FILE__, __LINE__, &(e), __pud_val(e), \
+ (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+#define pgd_ERROR(e) \
+ printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", \
+ __FILE__, __LINE__, &(e), __pgd_val(e), \
+ (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+
+#define pgd_none(x) (!__pgd_val(x))
+#define pud_none(x) (!__pud_val(x))
+
+struct mm_struct;
+
+#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
+
+static inline void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+ *ptep = pte;
+}
+
+static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ xen_set_pte(ptep, pte);
+}
+
+#ifdef CONFIG_SMP
+static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret)
+{
+ return __pte_ma(xchg(&xp->pte, 0));
+}
+#else
+#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
+#endif
+
+static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ xen_l2_entry_update(pmdp, pmd);
+}
+
+static inline void xen_pmd_clear(pmd_t *pmd)
+{
+ xen_set_pmd(pmd, xen_make_pmd(0));
+}
+
+static inline void xen_set_pud(pud_t *pudp, pud_t pud)
+{
+ xen_l3_entry_update(pudp, pud);
+}
+
+static inline void xen_pud_clear(pud_t *pud)
+{
+ xen_set_pud(pud, xen_make_pud(0));
+}
+
+#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
+
+static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ xen_l4_entry_update(pgdp, pgd);
+}
+
+static inline void xen_pgd_clear(pgd_t *pgd)
+{
+ xen_set_pgd(pgd, xen_make_pgd(0));
+ xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
+}
+
+#define pte_same(a, b) ((a).pte == (b).pte)
+
+#endif /* !__ASSEMBLY__ */
+
+#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
+#define PMD_MASK (~(PMD_SIZE - 1))
+#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_MASK (~(PUD_SIZE - 1))
+#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+
+
+#define MAXMEM _AC(0x00003fffffffffff, UL)
+#define VMALLOC_START _AC(0xffffc20000000000, UL)
+#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
+#define VMEMMAP_START _AC(0xffffe20000000000, UL)
+#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
+#define MODULES_END _AC(0xfffffffffff00000, UL)
+#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+
+#ifndef __ASSEMBLY__
+
+static inline int pgd_bad(pgd_t pgd)
+{
+ return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+}
+
+static inline int pud_bad(pud_t pud)
+{
+ return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+}
+
+static inline int pmd_bad(pmd_t pmd)
+{
+ return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+}
+
+#define pte_none(x) (!(x).pte)
+#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
+
+#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
+
+#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
+#define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
+ __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
+#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
+ (_pte).pte & _PAGE_PRESENT ? \
+ mfn_to_local_pfn(__pte_mfn(_pte)) : \
+ __pte_mfn(_pte))
+
+#define pte_page(x) pfn_to_page(pte_pfn((x)))
+
+/*
+ * Macro to mark a page protection value as "uncacheable".
+ */
+#define pgprot_noncached(prot) \
+ (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
+
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ */
+
+/*
+ * Level 4 access.
+ */
+#define pgd_page_vaddr(pgd) \
+ ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
+#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
+#define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
+static inline int pgd_large(pgd_t pgd) { return 0; }
+#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
+
+/* PUD - Level3 access */
+/* to find an entry in a page-table-directory. */
+#define pud_page_vaddr(pud) \
+ ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
+#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
+#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
+#define pud_offset(pgd, address) \
+ ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
+#define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
+
+static inline int pud_large(pud_t pte)
+{
+ return (__pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ (_PAGE_PSE | _PAGE_PRESENT);
+}
+
+/* PMD - Level 2 access */
+#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
+#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
+
+#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
+ pmd_index(address))
+#define pmd_none(x) (!__pmd_val(x))
+#if CONFIG_XEN_COMPAT <= 0x030002
+/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
+ can temporarily clear it. */
+#define pmd_present(x) (__pmd_val(x))
+#else
+#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
+#endif
+#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
+#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+
+#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
+ _PAGE_FILE })
+#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
+
+/* PTE - Level 1 access. */
+
+/* page, protection -> pte */
+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
+
+#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
+ pte_index((address)))
+
+/* x86-64 always has all page tables mapped. */
+#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
+#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
+#define pte_unmap(pte) /* NOP */
+#define pte_unmap_nested(pte) /* NOP */
+
+#define update_mmu_cache(vma, address, pte) do { } while (0)
+
+extern int direct_gbpages;
+
+/* Encode and de-code a swap entry */
+#define __swp_type(x) (((x).val >> 1) & 0x3f)
+#define __swp_offset(x) ((x).val >> 8)
+#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | \
+ ((offset) << 8) })
+#define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
+#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+
+extern int kern_addr_valid(unsigned long addr);
+extern void cleanup_highmap(void);
+
+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
+ direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO)
+
+#define HAVE_ARCH_UNMAPPED_AREA
+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+
+#define pgtable_cache_init() do { } while (0)
+#define check_pgt_cache() do { } while (0)
+
+#define PAGE_AGP PAGE_KERNEL_NOCACHE
+#define HAVE_PAGE_AGP 1
+
+/* fs/proc/kcore.c */
+#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
+#define kc_offset_to_vaddr(o) \
+ (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \
+ ? ((o) | ~__VIRTUAL_MASK) \
+ : (o))
+
+#define __HAVE_ARCH_PTE_SAME
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _X86_64_PGTABLE_H */
--- /dev/null
+#ifndef __ASM_X86_PROCESSOR_H
+#define __ASM_X86_PROCESSOR_H
+
+#include <asm/processor-flags.h>
+
+/* Forward declaration, a strange C thing */
+struct task_struct;
+struct mm_struct;
+
+#include <asm/vm86.h>
+#include <asm/math_emu.h>
+#include <asm/segment.h>
+#include <asm/types.h>
+#include <asm/sigcontext.h>
+#include <asm/current.h>
+#include <asm/cpufeature.h>
+#include <asm/system.h>
+#include <asm/page.h>
+#include <asm/percpu.h>
+#include <asm/msr.h>
+#include <asm/desc_defs.h>
+#include <asm/nops.h>
+
+#include <linux/personality.h>
+#include <linux/cpumask.h>
+#include <linux/cache.h>
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <xen/interface/physdev.h>
+
+/*
+ * Default implementation of macro that returns current
+ * instruction pointer ("program counter").
+ */
+static inline void *current_text_addr(void)
+{
+ void *pc;
+
+ asm volatile("mov $1f, %0; 1:":"=r" (pc));
+
+ return pc;
+}
+
+#ifdef CONFIG_X86_VSMP
+# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
+# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
+#else
+# define ARCH_MIN_TASKALIGN 16
+# define ARCH_MIN_MMSTRUCT_ALIGN 0
+#endif
+
+/*
+ * CPU type and hardware bug flags. Kept separately for each CPU.
+ * Members of this structure are referenced in head.S, so think twice
+ * before touching them. [mj]
+ */
+
+struct cpuinfo_x86 {
+ __u8 x86; /* CPU family */
+ __u8 x86_vendor; /* CPU vendor */
+ __u8 x86_model;
+ __u8 x86_mask;
+#ifdef CONFIG_X86_32
+ char wp_works_ok; /* It doesn't on 386's */
+
+ /* Problems on some 486Dx4's and old 386's: */
+ char hlt_works_ok;
+ char hard_math;
+ char rfu;
+ char fdiv_bug;
+ char f00f_bug;
+ char coma_bug;
+ char pad0;
+#else
+ /* Number of 4K pages in DTLB/ITLB combined(in pages): */
+ int x86_tlbsize;
+ __u8 x86_virt_bits;
+ __u8 x86_phys_bits;
+ /* CPUID returned core id bits: */
+ __u8 x86_coreid_bits;
+ /* Max extended CPUID function supported: */
+ __u32 extended_cpuid_level;
+#endif
+ /* Maximum supported CPUID level, -1=no CPUID: */
+ int cpuid_level;
+ __u32 x86_capability[NCAPINTS];
+ char x86_vendor_id[16];
+ char x86_model_id[64];
+ /* in KB - valid for CPUS which support this call: */
+ int x86_cache_size;
+ int x86_cache_alignment; /* In bytes */
+ int x86_power;
+ unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+ /* cpus sharing the last level cache: */
+ cpumask_t llc_shared_map;
+#endif
+ /* cpuid returned max cores value: */
+ u16 x86_max_cores;
+ u16 apicid;
+ u16 initial_apicid;
+ u16 x86_clflush_size;
+#ifdef CONFIG_SMP
+ /* number of cores as seen by the OS: */
+ u16 booted_cores;
+ /* Physical processor id: */
+ u16 phys_proc_id;
+ /* Core id: */
+ u16 cpu_core_id;
+ /* Index into per_cpu list: */
+ u16 cpu_index;
+#endif
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define X86_VENDOR_INTEL 0
+#define X86_VENDOR_CYRIX 1
+#define X86_VENDOR_AMD 2
+#define X86_VENDOR_UMC 3
+#define X86_VENDOR_CENTAUR 5
+#define X86_VENDOR_TRANSMETA 7
+#define X86_VENDOR_NSC 8
+#define X86_VENDOR_NUM 9
+
+#define X86_VENDOR_UNKNOWN 0xff
+
+/*
+ * capabilities of CPUs
+ */
+extern struct cpuinfo_x86 boot_cpu_data;
+extern struct cpuinfo_x86 new_cpu_data;
+
+extern __u32 cleared_cpu_caps[NCAPINTS];
+
+#ifdef CONFIG_SMP
+DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
+#define cpu_data(cpu) per_cpu(cpu_info, cpu)
+#define current_cpu_data cpu_data(smp_processor_id())
+#else
+#define cpu_data(cpu) boot_cpu_data
+#define current_cpu_data boot_cpu_data
+#endif
+
+static inline int hlt_works(int cpu)
+{
+#ifdef CONFIG_X86_32
+ return cpu_data(cpu).hlt_works_ok;
+#else
+ return 1;
+#endif
+}
+
+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
+
+extern void cpu_detect(struct cpuinfo_x86 *c);
+
+extern void identify_cpu(struct cpuinfo_x86 *);
+extern void identify_boot_cpu(void);
+extern void identify_secondary_cpu(struct cpuinfo_x86 *);
+extern void print_cpu_info(struct cpuinfo_x86 *);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern unsigned short num_cache_leaves;
+
+#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64)
+extern void detect_ht(struct cpuinfo_x86 *c);
+#else
+static inline void detect_ht(struct cpuinfo_x86 *c) {}
+#endif
+
+static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm(XEN_CPUID
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+static inline void load_cr3(pgd_t *pgdir)
+{
+ write_cr3(__pa(pgdir));
+}
+
+#ifndef CONFIG_X86_NO_TSS
+#ifdef CONFIG_X86_32
+/* This is the TSS defined by the hardware. */
+struct x86_hw_tss {
+ unsigned short back_link, __blh;
+ unsigned long sp0;
+ unsigned short ss0, __ss0h;
+ unsigned long sp1;
+ /* ss1 caches MSR_IA32_SYSENTER_CS: */
+ unsigned short ss1, __ss1h;
+ unsigned long sp2;
+ unsigned short ss2, __ss2h;
+ unsigned long __cr3;
+ unsigned long ip;
+ unsigned long flags;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long bx;
+ unsigned long sp;
+ unsigned long bp;
+ unsigned long si;
+ unsigned long di;
+ unsigned short es, __esh;
+ unsigned short cs, __csh;
+ unsigned short ss, __ssh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+ unsigned short ldt, __ldth;
+ unsigned short trace;
+ unsigned short io_bitmap_base;
+
+} __attribute__((packed));
+extern struct tss_struct doublefault_tss;
+#else
+struct x86_hw_tss {
+ u32 reserved1;
+ u64 sp0;
+ u64 sp1;
+ u64 sp2;
+ u64 reserved2;
+ u64 ist[7];
+ u32 reserved3;
+ u32 reserved4;
+ u16 reserved5;
+ u16 io_bitmap_base;
+
+} __attribute__((packed)) ____cacheline_aligned;
+#endif
+#endif /* CONFIG_X86_NO_TSS */
+
+/*
+ * IO-bitmap sizes:
+ */
+#define IO_BITMAP_BITS 65536
+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
+#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
+#define INVALID_IO_BITMAP_OFFSET 0x8000
+#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
+
+#ifndef CONFIG_X86_NO_TSS
+struct tss_struct {
+ /*
+ * The hardware state:
+ */
+ struct x86_hw_tss x86_tss;
+
+ /*
+ * The extra 1 is there because the CPU will access an
+ * additional byte beyond the end of the IO permission
+ * bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+ /*
+ * Cache the current maximum and the last task that used the bitmap:
+ */
+ unsigned long io_bitmap_max;
+ struct thread_struct *io_bitmap_owner;
+
+ /*
+ * Pad the TSS to be cacheline-aligned (size is 0x100):
+ */
+ unsigned long __cacheline_filler[35];
+ /*
+ * .. and then another 0x100 bytes for the emergency kernel stack:
+ */
+ unsigned long stack[64];
+
+} __attribute__((packed));
+
+DECLARE_PER_CPU(struct tss_struct, init_tss);
+
+/*
+ * Save the original ist values for checking stack pointers during debugging
+ */
+struct orig_ist {
+ unsigned long ist[7];
+};
+#endif /* CONFIG_X86_NO_TSS */
+
+#define MXCSR_DEFAULT 0x1f80
+
+struct i387_fsave_struct {
+ u32 cwd; /* FPU Control Word */
+ u32 swd; /* FPU Status Word */
+ u32 twd; /* FPU Tag Word */
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Pointer Offset */
+ u32 fos; /* FPU Operand Pointer Selector */
+
+ /* 8*10 bytes for each FP-reg = 80 bytes: */
+ u32 st_space[20];
+
+ /* Software status information [not touched by FSAVE ]: */
+ u32 status;
+};
+
+struct i387_fxsave_struct {
+ u16 cwd; /* Control Word */
+ u16 swd; /* Status Word */
+ u16 twd; /* Tag Word */
+ u16 fop; /* Last Instruction Opcode */
+ union {
+ struct {
+ u64 rip; /* Instruction Pointer */
+ u64 rdp; /* Data Pointer */
+ };
+ struct {
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Offset */
+ u32 fos; /* FPU Operand Selector */
+ };
+ };
+ u32 mxcsr; /* MXCSR Register State */
+ u32 mxcsr_mask; /* MXCSR Mask */
+
+ /* 8*16 bytes for each FP-reg = 128 bytes: */
+ u32 st_space[32];
+
+ /* 16*16 bytes for each XMM-reg = 256 bytes: */
+ u32 xmm_space[64];
+
+ u32 padding[24];
+
+} __attribute__((aligned(16)));
+
+struct i387_soft_struct {
+ u32 cwd;
+ u32 swd;
+ u32 twd;
+ u32 fip;
+ u32 fcs;
+ u32 foo;
+ u32 fos;
+ /* 8*10 bytes for each FP-reg = 80 bytes: */
+ u32 st_space[20];
+ u8 ftop;
+ u8 changed;
+ u8 lookahead;
+ u8 no_update;
+ u8 rm;
+ u8 alimit;
+ struct info *info;
+ u32 entry_eip;
+};
+
+union thread_xstate {
+ struct i387_fsave_struct fsave;
+ struct i387_fxsave_struct fxsave;
+ struct i387_soft_struct soft;
+};
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_TSS)
+DECLARE_PER_CPU(struct orig_ist, orig_ist);
+#endif
+
+extern void print_cpu_info(struct cpuinfo_x86 *);
+extern unsigned int xstate_size;
+extern void free_thread_xstate(struct task_struct *);
+extern struct kmem_cache *task_xstate_cachep;
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern unsigned short num_cache_leaves;
+
+struct thread_struct {
+ /* Cached TLS descriptors: */
+ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+ unsigned long sp0;
+ unsigned long sp;
+#ifdef CONFIG_X86_32
+ unsigned long sysenter_cs;
+#else
+ unsigned long usersp; /* Copy from PDA */
+ unsigned short es;
+ unsigned short ds;
+ unsigned short fsindex;
+ unsigned short gsindex;
+#endif
+ unsigned long ip;
+ unsigned long fs;
+ unsigned long gs;
+ /* Hardware debugging registers: */
+ unsigned long debugreg0;
+ unsigned long debugreg1;
+ unsigned long debugreg2;
+ unsigned long debugreg3;
+ unsigned long debugreg6;
+ unsigned long debugreg7;
+ /* Fault info: */
+ unsigned long cr2;
+ unsigned long trap_no;
+ unsigned long error_code;
+ /* floating point and extended processor state */
+ union thread_xstate *xstate;
+#ifdef CONFIG_X86_32
+ /* Virtual 86 mode info */
+ struct vm86_struct __user *vm86_info;
+ unsigned long screen_bitmap;
+ unsigned long v86flags, v86mask, saved_sp0;
+ unsigned int saved_fs, saved_gs;
+#endif
+ /* IO permissions: */
+ unsigned long *io_bitmap_ptr;
+ unsigned long iopl;
+ /* Max allowed port in the bitmap, in bytes: */
+ unsigned io_bitmap_max;
+/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
+ unsigned long debugctlmsr;
+/* Debug Store - if not 0 points to a DS Save Area configuration;
+ * goes into MSR_IA32_DS_AREA */
+ unsigned long ds_area_msr;
+};
+
+static inline unsigned long xen_get_debugreg(int regno)
+{
+ return HYPERVISOR_get_debugreg(regno);
+}
+
+static inline void xen_set_debugreg(int regno, unsigned long value)
+{
+ WARN_ON(HYPERVISOR_set_debugreg(regno, value));
+}
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void xen_set_iopl_mask(unsigned mask)
+{
+ struct physdev_set_iopl set_iopl;
+
+ /* Force the change at ring 0. */
+ set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
+}
+
+#ifndef CONFIG_X86_NO_TSS
+static inline void
+native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+{
+ tss->x86_tss.sp0 = thread->sp0;
+#ifdef CONFIG_X86_32
+ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
+ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
+ tss->x86_tss.ss1 = thread->sysenter_cs;
+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+ }
+#endif
+}
+#else
+#define xen_load_sp0(tss, thread) do { \
+ if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \
+ BUG(); \
+} while (0)
+#endif
+
+#define __cpuid xen_cpuid
+#define paravirt_enabled() 0
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register) \
+ (var) = xen_get_debugreg(register)
+#define set_debugreg(value, register) \
+ xen_set_debugreg(register, value)
+
+#define load_sp0 xen_load_sp0
+
+#define set_iopl_mask xen_set_iopl_mask
+
+/*
+ * Save the cr4 feature set we're using (ie
+ * Pentium 4MB enable and PPro Global page
+ * enable), so that any CPU's that boot up
+ * after us can get the correct flags.
+ */
+extern unsigned long mmu_cr4_features;
+
+static inline void set_in_cr4(unsigned long mask)
+{
+ unsigned cr4;
+
+ mmu_cr4_features |= mask;
+ cr4 = read_cr4();
+ cr4 |= mask;
+ write_cr4(cr4);
+}
+
+static inline void clear_in_cr4(unsigned long mask)
+{
+ unsigned cr4;
+
+ mmu_cr4_features &= ~mask;
+ cr4 = read_cr4();
+ cr4 &= ~mask;
+ write_cr4(cr4);
+}
+
+struct microcode_header {
+ unsigned int hdrver;
+ unsigned int rev;
+ unsigned int date;
+ unsigned int sig;
+ unsigned int cksum;
+ unsigned int ldrver;
+ unsigned int pf;
+ unsigned int datasize;
+ unsigned int totalsize;
+ unsigned int reserved[3];
+};
+
+struct microcode {
+ struct microcode_header hdr;
+ unsigned int bits[0];
+};
+
+typedef struct microcode microcode_t;
+typedef struct microcode_header microcode_header_t;
+
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int cksum;
+};
+
+struct extended_sigtable {
+ unsigned int count;
+ unsigned int cksum;
+ unsigned int reserved[3];
+ struct extended_signature sigs[0];
+};
+
+typedef struct {
+ unsigned long seg;
+} mm_segment_t;
+
+
+/*
+ * create a kernel thread without removing it from tasklists
+ */
+extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+
+/* Free all resources held by a thread. */
+extern void release_thread(struct task_struct *);
+
+/* Prepare to copy thread state - unlazy all lazy state */
+extern void prepare_to_copy(struct task_struct *tsk);
+
+unsigned long get_wchan(struct task_struct *p);
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(unsigned int op,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = 0;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+static inline void cpuid_count(unsigned int op, int count,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = count;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * CPUID functions returning a single datum
+ */
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return eax;
+}
+
+static inline unsigned int cpuid_ebx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ebx;
+}
+
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ecx;
+}
+
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return edx;
+}
+
+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+static inline void rep_nop(void)
+{
+ asm volatile("rep; nop" ::: "memory");
+}
+
+static inline void cpu_relax(void)
+{
+ rep_nop();
+}
+
+/* Stop speculative execution: */
+static inline void sync_core(void)
+{
+ int tmp;
+
+ asm volatile("cpuid" : "=a" (tmp) : "0" (1)
+ : "ebx", "ecx", "edx", "memory");
+}
+
+static inline void __monitor(const void *eax, unsigned long ecx,
+ unsigned long edx)
+{
+ /* "monitor %eax, %ecx, %edx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xc8;"
+ :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void __mwait(unsigned long eax, unsigned long ecx)
+{
+ /* "mwait %eax, %ecx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xc9;"
+ :: "a" (eax), "c" (ecx));
+}
+
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+ trace_hardirqs_on();
+ /* "mwait %eax, %ecx;" */
+ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
+ :: "a" (eax), "c" (ecx));
+}
+
+extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
+
+extern int force_mwait;
+
+extern void select_idle_routine(const struct cpuinfo_x86 *c);
+
+extern unsigned long boot_option_idle_override;
+
+extern void enable_sep_cpu(void);
+extern int sysenter_setup(void);
+
+/* Defined in head.S */
+extern struct desc_ptr early_gdt_descr;
+
+extern void cpu_set_gdt(int);
+extern void switch_to_new_gdt(void);
+extern void cpu_init(void);
+extern void init_gdt(int cpu);
+
+static inline void update_debugctlmsr(unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return;
+#endif
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+}
+
+/*
+ * from system description table in BIOS. Mostly for MCA use, but
+ * others may find it useful:
+ */
+extern unsigned int machine_id;
+extern unsigned int machine_submodel_id;
+extern unsigned int BIOS_revision;
+
+/* Boot loader type from the setup header: */
+extern int bootloader_type;
+
+extern char ignore_fpu_irq;
+
+#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
+#define ARCH_HAS_PREFETCHW
+#define ARCH_HAS_SPINLOCK_PREFETCH
+
+#ifdef CONFIG_X86_32
+# define BASE_PREFETCH ASM_NOP4
+# define ARCH_HAS_PREFETCH
+#else
+# define BASE_PREFETCH "prefetcht0 (%1)"
+#endif
+
+/*
+ * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
+ *
+ * It's not worth to care about 3dnow prefetches for the K6
+ * because they are microcoded there and very slow.
+ */
+static inline void prefetch(const void *x)
+{
+ alternative_input(BASE_PREFETCH,
+ "prefetchnta (%1)",
+ X86_FEATURE_XMM,
+ "r" (x));
+}
+
+/*
+ * 3dnow prefetch to get an exclusive cache line.
+ * Useful for spinlocks to avoid one state transition in the
+ * cache coherency protocol:
+ */
+static inline void prefetchw(const void *x)
+{
+ alternative_input(BASE_PREFETCH,
+ "prefetchw (%1)",
+ X86_FEATURE_3DNOW,
+ "r" (x));
+}
+
+static inline void spin_lock_prefetch(const void *x)
+{
+ prefetchw(x);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * User space process size: 3GB (default).
+ */
+#define TASK_SIZE PAGE_OFFSET
+#define STACK_TOP TASK_SIZE
+#define STACK_TOP_MAX STACK_TOP
+
+#define INIT_THREAD { \
+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
+ .vm86_info = NULL, \
+ .sysenter_cs = __KERNEL_CS, \
+ .io_bitmap_ptr = NULL, \
+ .fs = __KERNEL_PERCPU, \
+}
+
+/*
+ * Note that the .io_bitmap member must be extra-big. This is because
+ * the CPU will access an additional byte beyond the end of the IO
+ * permission bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+#define INIT_TSS { \
+ .x86_tss = { \
+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
+ .ss0 = __KERNEL_DS, \
+ .ss1 = __KERNEL_CS, \
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
+ }, \
+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
+}
+
+extern unsigned long thread_saved_pc(struct task_struct *tsk);
+
+#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
+#define KSTK_TOP(info) \
+({ \
+ unsigned long *__ptr = (unsigned long *)(info); \
+ (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
+})
+
+/*
+ * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+ * This is necessary to guarantee that the entire "struct pt_regs"
+ * is accessable even if the CPU haven't stored the SS/ESP registers
+ * on the stack (interrupt gate does not save these registers
+ * when switching to the same priv ring).
+ * Therefore beware: accessing the ss/esp fields of the
+ * "struct pt_regs" is possible, but they may contain the
+ * completely wrong values.
+ */
+#define task_pt_regs(task) \
+({ \
+ struct pt_regs *__regs__; \
+ __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
+ __regs__ - 1; \
+})
+
+#define KSTK_ESP(task) (task_pt_regs(task)->sp)
+
+#else
+/*
+ * User space process size. 47bits minus one guard page.
+ */
+#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE)
+
+/* This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
+ 0xc0000000 : 0xFFFFe000)
+
+#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
+ IA32_PAGE_OFFSET : TASK_SIZE64)
+#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
+ IA32_PAGE_OFFSET : TASK_SIZE64)
+
+#define STACK_TOP TASK_SIZE
+#define STACK_TOP_MAX TASK_SIZE64
+
+#define INIT_THREAD { \
+ .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+}
+
+#define INIT_TSS { \
+ .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+}
+
+/*
+ * Return saved PC of a blocked thread.
+ * What is this good for? it will be always the scheduler or ret_from_fork.
+ */
+#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
+
+#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
+#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
+#endif /* CONFIG_X86_64 */
+
+extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
+ unsigned long new_sp);
+
+/*
+ * This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
+
+#define KSTK_EIP(task) (task_pt_regs(task)->ip)
+
+/* Get/set a process' ability to use the timestamp counter instruction */
+#define GET_TSC_CTL(adr) get_tsc_mode((adr))
+#define SET_TSC_CTL(val) set_tsc_mode((val))
+
+extern int get_tsc_mode(unsigned long adr);
+extern int set_tsc_mode(unsigned int val);
+
+#endif
--- /dev/null
+#ifndef _ASM_X86_SEGMENT_H_
+#define _ASM_X86_SEGMENT_H_
+
+/* Simple and small GDT entries for booting only */
+
+#define GDT_ENTRY_BOOT_CS 2
+#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
+
+#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
+#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
+
+#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
+#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
+
+#ifdef CONFIG_X86_32
+/*
+ * The layout of the per-CPU GDT under Linux:
+ *
+ * 0 - null
+ * 1 - reserved
+ * 2 - reserved
+ * 3 - reserved
+ *
+ * 4 - unused <==== new cacheline
+ * 5 - unused
+ *
+ * ------- start of TLS (Thread-Local Storage) segments:
+ *
+ * 6 - TLS segment #1 [ glibc's TLS segment ]
+ * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
+ * 8 - TLS segment #3
+ * 9 - reserved
+ * 10 - reserved
+ * 11 - reserved
+ *
+ * ------- start of kernel segments:
+ *
+ * 12 - kernel code segment <==== new cacheline
+ * 13 - kernel data segment
+ * 14 - default user CS
+ * 15 - default user DS
+ * 16 - TSS
+ * 17 - LDT
+ * 18 - PNPBIOS support (16->32 gate)
+ * 19 - PNPBIOS support
+ * 20 - PNPBIOS support
+ * 21 - PNPBIOS support
+ * 22 - PNPBIOS support
+ * 23 - APM BIOS support
+ * 24 - APM BIOS support
+ * 25 - APM BIOS support
+ *
+ * 26 - ESPFIX small SS
+ * 27 - per-cpu [ offset to per-cpu data area ]
+ * 28 - unused
+ * 29 - unused
+ * 30 - unused
+ * 31 - TSS for double fault handler
+ */
+#define GDT_ENTRY_TLS_MIN 6
+#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+
+#define GDT_ENTRY_DEFAULT_USER_CS 14
+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
+
+#define GDT_ENTRY_DEFAULT_USER_DS 15
+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
+
+#define GDT_ENTRY_KERNEL_BASE 12
+
+#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
+
+#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
+
+#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
+#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
+
+#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
+#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
+
+#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
+
+#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
+#ifdef CONFIG_SMP
+#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
+#else
+#define __KERNEL_PERCPU 0
+#endif
+
+#define GDT_ENTRY_DOUBLEFAULT_TSS 31
+
+/*
+ * The GDT has 32 entries
+ */
+#define GDT_ENTRIES 32
+
+/* The PnP BIOS entries in the GDT */
+#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
+#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
+#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
+#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
+#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
+
+/* The PnP BIOS selectors */
+#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
+#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
+#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
+#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
+#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
+
+/* Bottom two bits of selector give the ring privilege level */
+#define SEGMENT_RPL_MASK 0x3
+/* Bit 2 is table indicator (LDT/GDT) */
+#define SEGMENT_TI_MASK 0x4
+
+/* User mode is privilege level 3 */
+#define USER_RPL 0x3
+/* LDT segment has TI set, GDT has it cleared */
+#define SEGMENT_LDT 0x4
+#define SEGMENT_GDT 0x0
+
+/*
+ * Matching rules for certain types of segments.
+ */
+
+/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
+#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
+ || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
+
+/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
+#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
+ || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
+ || ((x) & ~3) == (FLAT_USER_CS & ~3))
+
+/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
+#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
+
+#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
+
+#else
+#include <asm/cache.h>
+
+#define __KERNEL_CS 0x10
+#define __KERNEL_DS 0x18
+
+#define __KERNEL32_CS 0x08
+
+/*
+ * we cannot use the same code segment descriptor for user and kernel
+ * -- not even in the long flat mode, because of different DPL /kkeil
+ * The segment offset needs to contain a RPL. Grr. -AK
+ * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
+ */
+
+#define __USER32_CS 0x23 /* 4*8+3 */
+#define __USER_DS 0x2b /* 5*8+3 */
+#define __USER_CS 0x33 /* 6*8+3 */
+#define __USER32_DS __USER_DS
+
+#define GDT_ENTRY_TSS 8 /* needs two entries */
+#define GDT_ENTRY_LDT 10 /* needs two entries */
+#define GDT_ENTRY_TLS_MIN 12
+#define GDT_ENTRY_TLS_MAX 14
+
+#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
+#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
+
+/* TLS indexes for 64bit - hardcoded in arch_prctl */
+#define FS_TLS 0
+#define GS_TLS 1
+
+#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
+#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
+
+#define GDT_ENTRIES 16
+
+#endif
+
+/* User mode is privilege level 3 */
+#define USER_RPL 0x3
+/* LDT segment has TI set, GDT has it cleared */
+#define SEGMENT_LDT 0x4
+#define SEGMENT_GDT 0x0
+
+/* Bottom two bits of selector give the ring privilege level */
+#define SEGMENT_RPL_MASK 0x3
+/* Bit 2 is table indicator (LDT/GDT) */
+#define SEGMENT_TI_MASK 0x4
+
+#define IDT_ENTRIES 256
+#define NUM_EXCEPTION_VECTORS 32
+#define GDT_SIZE (GDT_ENTRIES * 8)
+#define GDT_ENTRY_TLS_ENTRIES 3
+#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
+
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
+#endif
+#endif
+
+#endif
--- /dev/null
+#ifndef _ASM_X86_SMP_H_
+#define _ASM_X86_SMP_H_
+#ifndef __ASSEMBLY__
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <asm/percpu.h>
+
+/*
+ * We need the APIC definitions automatically as part of 'smp.h'
+ */
+#ifdef CONFIG_X86_LOCAL_APIC
+# include <asm/mpspec.h>
+# include <asm/apic.h>
+# ifdef CONFIG_X86_IO_APIC
+# include <asm/io_apic.h>
+# endif
+#endif
+#include <asm/pda.h>
+#include <linux/thread_info.h>
+
+#define cpu_callout_map cpu_possible_map
+extern cpumask_t cpu_initialized;
+#define cpu_callin_map cpu_possible_map
+
+extern void (*mtrr_hook)(void);
+extern void zap_low_mappings(void);
+
+extern int smp_num_siblings;
+extern unsigned int num_processors;
+extern cpumask_t cpu_initialized;
+
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+extern u16 x86_cpu_to_apicid_init[];
+extern u16 x86_bios_cpu_apicid_init[];
+extern void *x86_cpu_to_apicid_early_ptr;
+extern void *x86_bios_cpu_apicid_early_ptr;
+#else
+#define x86_cpu_to_apicid_early_ptr NULL
+#define x86_bios_cpu_apicid_early_ptr NULL
+#endif
+
+DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
+DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+DECLARE_PER_CPU(u16, cpu_llc_id);
+DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
+DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
+
+#ifdef CONFIG_SMP
+
+#ifndef CONFIG_XEN
+
+/* Static state in head.S used to set up a CPU */
+extern struct {
+ void *sp;
+ unsigned short ss;
+} stack_start;
+
+struct smp_ops {
+ void (*smp_prepare_boot_cpu)(void);
+ void (*smp_prepare_cpus)(unsigned max_cpus);
+ int (*cpu_up)(unsigned cpu);
+ void (*smp_cpus_done)(unsigned max_cpus);
+
+ void (*smp_send_stop)(void);
+ void (*smp_send_reschedule)(int cpu);
+ int (*smp_call_function_mask)(cpumask_t mask,
+ void (*func)(void *info), void *info,
+ int wait);
+};
+
+/* Globals due to paravirt */
+extern void set_cpu_sibling_map(int cpu);
+
+#ifndef CONFIG_PARAVIRT
+#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
+#endif
+extern struct smp_ops smp_ops;
+
+static inline void smp_send_stop(void)
+{
+ smp_ops.smp_send_stop();
+}
+
+static inline void smp_prepare_boot_cpu(void)
+{
+ smp_ops.smp_prepare_boot_cpu();
+}
+
+static inline void smp_prepare_cpus(unsigned int max_cpus)
+{
+ smp_ops.smp_prepare_cpus(max_cpus);
+}
+
+static inline void smp_cpus_done(unsigned int max_cpus)
+{
+ smp_ops.smp_cpus_done(max_cpus);
+}
+
+static inline int __cpu_up(unsigned int cpu)
+{
+ return smp_ops.cpu_up(cpu);
+}
+
+static inline void smp_send_reschedule(int cpu)
+{
+ smp_ops.smp_send_reschedule(cpu);
+}
+
+static inline int smp_call_function_mask(cpumask_t mask,
+ void (*func) (void *info), void *info,
+ int wait)
+{
+ return smp_ops.smp_call_function_mask(mask, func, info, wait);
+}
+
+void native_smp_prepare_boot_cpu(void);
+void native_smp_prepare_cpus(unsigned int max_cpus);
+void native_smp_cpus_done(unsigned int max_cpus);
+int native_cpu_up(unsigned int cpunum);
+
+#else /* CONFIG_XEN */
+
+void xen_smp_send_stop(void);
+void xen_smp_send_reschedule(int cpu);
+int xen_smp_call_function_mask(cpumask_t mask,
+ void (*func) (void *info), void *info,
+ int wait);
+
+#define smp_send_stop xen_smp_send_stop
+#define smp_send_reschedule xen_smp_send_reschedule
+#define smp_call_function_mask xen_smp_call_function_mask
+
+extern void prefill_possible_map(void);
+
+#endif /* CONFIG_XEN */
+
+extern int __cpu_disable(void);
+extern void __cpu_die(unsigned int cpu);
+
+extern void prefill_possible_map(void);
+
+void smp_store_cpu_info(int id);
+#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
+
+/* We don't mark CPUs online until __cpu_up(), so we need another measure */
+static inline int num_booting_cpus(void)
+{
+ return cpus_weight(cpu_callout_map);
+}
+#endif /* CONFIG_SMP */
+
+extern unsigned disabled_cpus __cpuinitdata;
+
+#ifdef CONFIG_X86_32_SMP
+/*
+ * This function is needed by all SMP systems. It must _always_ be valid
+ * from the initial startup. We map APIC_BASE very early in page_setup(),
+ * so this is correct in the x86 case.
+ */
+DECLARE_PER_CPU(int, cpu_number);
+#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+#define safe_smp_processor_id() smp_processor_id()
+
+#elif defined(CONFIG_X86_64_SMP)
+#define raw_smp_processor_id() read_pda(cpunumber)
+
+#define stack_smp_processor_id() \
+({ \
+ struct thread_info *ti; \
+ __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
+ ti->cpu; \
+})
+#define safe_smp_processor_id() smp_processor_id()
+
+#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
+#define cpu_physical_id(cpu) boot_cpu_physical_apicid
+#define safe_smp_processor_id() 0
+#define stack_smp_processor_id() 0
+#endif
+
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
+
+static inline int logical_smp_processor_id(void)
+{
+ /* we don't want to mark this access volatile - bad code generation */
+ return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+}
+
+#ifndef CONFIG_X86_64
+static inline unsigned int read_apic_id(void)
+{
+ return *(u32 *)(APIC_BASE + APIC_ID);
+}
+#else
+extern unsigned int read_apic_id(void);
+#endif
+
+
+# ifdef APIC_DEFINITION
+extern int hard_smp_processor_id(void);
+# else
+# include <mach_apicdef.h>
+static inline int hard_smp_processor_id(void)
+{
+ /* we don't want to mark this access volatile - bad code generation */
+ return GET_APIC_ID(read_apic_id());
+}
+# endif /* APIC_DEFINITION */
+
+#else /* CONFIG_X86_LOCAL_APIC */
+
+# ifndef CONFIG_SMP
+# define hard_smp_processor_id() 0
+# endif
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern void cpu_exit_clear(void);
+extern void cpu_uninit(void);
+#endif
+
+extern void smp_alloc_memory(void);
+extern void lock_ipi_call_lock(void);
+extern void unlock_ipi_call_lock(void);
+#endif /* __ASSEMBLY__ */
+#endif
--- /dev/null
+#ifndef _ASM_SWIOTLB_H
+
+#include "../../swiotlb.h"
+
+dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size,
+ int dir);
+
+#endif /* _ASM_SWIOTLB_H */
--- /dev/null
+#ifndef __XEN_SYNCH_BITOPS_H__
+#define __XEN_SYNCH_BITOPS_H__
+
+/*
+ * Copyright 1992, Linus Torvalds.
+ * Heavily modified to provide guaranteed strong synchronisation
+ * when communicating with Xen or other guest OSes running on other CPUs.
+ */
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#define ADDR (*(volatile long *) addr)
+
+static __inline__ void synch_set_bit(int nr, volatile void * addr)
+{
+ __asm__ __volatile__ (
+ "lock btsl %1,%0"
+ : "+m" (ADDR) : "Ir" (nr) : "memory" );
+}
+
+static __inline__ void synch_clear_bit(int nr, volatile void * addr)
+{
+ __asm__ __volatile__ (
+ "lock btrl %1,%0"
+ : "+m" (ADDR) : "Ir" (nr) : "memory" );
+}
+
+static __inline__ void synch_change_bit(int nr, volatile void * addr)
+{
+ __asm__ __volatile__ (
+ "lock btcl %1,%0"
+ : "+m" (ADDR) : "Ir" (nr) : "memory" );
+}
+
+static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+ __asm__ __volatile__ (
+ "lock btsl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+ __asm__ __volatile__ (
+ "lock btrl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
+{
+ int oldbit;
+
+ __asm__ __volatile__ (
+ "lock btcl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+struct __synch_xchg_dummy { unsigned long a[100]; };
+#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x))
+
+#define synch_cmpxchg(ptr, old, new) \
+((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
+ (unsigned long)(old), \
+ (unsigned long)(new), \
+ sizeof(*(ptr))))
+
+static inline unsigned long __synch_cmpxchg(volatile void *ptr,
+ unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ __asm__ __volatile__("lock; cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__synch_xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ __asm__ __volatile__("lock; cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__synch_xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+#ifdef CONFIG_X86_64
+ case 4:
+ __asm__ __volatile__("lock; cmpxchgl %k1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__synch_xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+ case 8:
+ __asm__ __volatile__("lock; cmpxchgq %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__synch_xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+#else
+ case 4:
+ __asm__ __volatile__("lock; cmpxchgl %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__synch_xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+#endif
+ }
+ return old;
+}
+
+#define synch_test_bit test_bit
+
+#define synch_cmpxchg_subword synch_cmpxchg
+
+#endif /* __XEN_SYNCH_BITOPS_H__ */
--- /dev/null
+#ifndef _ASM_X86_SYSTEM_H_
+#define _ASM_X86_SYSTEM_H_
+
+#include <asm/asm.h>
+#include <asm/segment.h>
+#include <asm/cpufeature.h>
+#include <asm/cmpxchg.h>
+#include <asm/nops.h>
+#include <asm/hypervisor.h>
+
+#include <linux/kernel.h>
+#include <linux/irqflags.h>
+
+/* entries in ARCH_DLINFO: */
+#ifdef CONFIG_IA32_EMULATION
+# define AT_VECTOR_SIZE_ARCH 2
+#else
+# define AT_VECTOR_SIZE_ARCH 1
+#endif
+
+#ifdef CONFIG_X86_32
+
+struct task_struct; /* one of the stranger aspects of C forward declarations */
+struct task_struct *__switch_to(struct task_struct *prev,
+ struct task_struct *next);
+
+/*
+ * Saving eflags is important. It switches not only IOPL between tasks,
+ * it also protects other tasks from NT leaking through sysenter etc.
+ */
+#define switch_to(prev, next, last) \
+do { \
+ /* \
+ * Context-switching clobbers all registers, so we clobber \
+ * them explicitly, via unused output variables. \
+ * (EAX and EBP is not listed because EBP is saved/restored \
+ * explicitly for wchan access and EAX is the return value of \
+ * __switch_to()) \
+ */ \
+ unsigned long ebx, ecx, edx, esi, edi; \
+ \
+ asm volatile("pushfl\n\t" /* save flags */ \
+ "pushl %%ebp\n\t" /* save EBP */ \
+ "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
+ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
+ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
+ "pushl %[next_ip]\n\t" /* restore EIP */ \
+ "jmp __switch_to\n" /* regparm call */ \
+ "1:\t" \
+ "popl %%ebp\n\t" /* restore EBP */ \
+ "popfl\n" /* restore flags */ \
+ \
+ /* output parameters */ \
+ : [prev_sp] "=m" (prev->thread.sp), \
+ [prev_ip] "=m" (prev->thread.ip), \
+ "=a" (last), \
+ \
+ /* clobbered output registers: */ \
+ "=b" (ebx), "=c" (ecx), "=d" (edx), \
+ "=S" (esi), "=D" (edi) \
+ \
+ /* input parameters: */ \
+ : [next_sp] "m" (next->thread.sp), \
+ [next_ip] "m" (next->thread.ip), \
+ \
+ /* regparm parameters for __switch_to(): */ \
+ [prev] "a" (prev), \
+ [next] "d" (next)); \
+} while (0)
+
+/*
+ * disable hlt during certain critical i/o operations
+ */
+#define HAVE_DISABLE_HLT
+#else
+#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
+#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
+
+/* frame pointer must be last for get_wchan */
+#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
+
+#define __EXTRA_CLOBBER \
+ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
+ "r12", "r13", "r14", "r15"
+
+/* Save restore flags to clear handle leaking NT */
+#define switch_to(prev, next, last) \
+ asm volatile(SAVE_CONTEXT \
+ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
+ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
+ "call __switch_to\n\t" \
+ ".globl thread_return\n" \
+ "thread_return:\n\t" \
+ "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
+ "movq %P[thread_info](%%rsi),%%r8\n\t" \
+ LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
+ "movq %%rax,%%rdi\n\t" \
+ "jc ret_from_fork\n\t" \
+ RESTORE_CONTEXT \
+ : "=a" (last) \
+ : [next] "S" (next), [prev] "D" (prev), \
+ [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
+ [ti_flags] "i" (offsetof(struct thread_info, flags)), \
+ [tif_fork] "i" (TIF_FORK), \
+ [thread_info] "i" (offsetof(struct task_struct, stack)), \
+ [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
+ : "memory", "cc" __EXTRA_CLOBBER)
+#endif
+
+#ifdef __KERNEL__
+#define _set_base(addr, base) do { unsigned long __pr; \
+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
+ "rorl $16,%%edx\n\t" \
+ "movb %%dl,%2\n\t" \
+ "movb %%dh,%3" \
+ :"=&d" (__pr) \
+ :"m" (*((addr)+2)), \
+ "m" (*((addr)+4)), \
+ "m" (*((addr)+7)), \
+ "0" (base) \
+ ); } while (0)
+
+#define _set_limit(addr, limit) do { unsigned long __lr; \
+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
+ "rorl $16,%%edx\n\t" \
+ "movb %2,%%dh\n\t" \
+ "andb $0xf0,%%dh\n\t" \
+ "orb %%dh,%%dl\n\t" \
+ "movb %%dl,%2" \
+ :"=&d" (__lr) \
+ :"m" (*(addr)), \
+ "m" (*((addr)+6)), \
+ "0" (limit) \
+ ); } while (0)
+
+#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
+#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
+
+extern void load_gs_index(unsigned);
+
+/*
+ * Load a segment. Fall back on loading the zero
+ * segment if something goes wrong..
+ */
+#define loadsegment(seg, value) \
+ asm volatile("\n" \
+ "1:\t" \
+ "movl %k0,%%" #seg "\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3:\t" \
+ "movl %k1, %%" #seg "\n\t" \
+ "jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b,3b) \
+ : :"r" (value), "r" (0))
+
+
+/*
+ * Save a segment register away
+ */
+#define savesegment(seg, value) \
+ asm volatile("mov %%" #seg ",%0":"=rm" (value))
+
+static inline unsigned long get_limit(unsigned long segment)
+{
+ unsigned long __limit;
+ asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
+ return __limit + 1;
+}
+
+static inline void xen_clts(void)
+{
+ HYPERVISOR_fpu_taskswitch(0);
+}
+
+static inline void xen_stts(void)
+{
+ HYPERVISOR_fpu_taskswitch(1);
+}
+
+/*
+ * Volatile isn't enough to prevent the compiler from reordering the
+ * read/write functions for the control registers and messing everything up.
+ * A memory clobber would solve the problem, but would prevent reordering of
+ * all loads stores around it, which can hurt performance. Solution is to
+ * use a variable and mimic reads and writes to it to enforce serialization
+ */
+static unsigned long __force_order;
+
+static inline unsigned long xen_read_cr0(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline void xen_write_cr0(unsigned long val)
+{
+ asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
+}
+
+#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
+#define xen_write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
+
+static inline unsigned long xen_read_cr3(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
+#ifdef CONFIG_X86_32
+ return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
+#else
+ return machine_to_phys(val);
+#endif
+}
+
+static inline void xen_write_cr3(unsigned long val)
+{
+#ifdef CONFIG_X86_32
+ val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
+#else
+ val = phys_to_machine(val);
+#endif
+ asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long xen_read_cr4(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+#define xen_read_cr4_safe() xen_read_cr4()
+
+static inline void xen_write_cr4(unsigned long val)
+{
+ asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long xen_read_cr8(void)
+{
+ return 0;
+}
+
+static inline void xen_write_cr8(unsigned long val)
+{
+ BUG_ON(val);
+}
+#endif
+
+static inline void xen_wbinvd(void)
+{
+ asm volatile("wbinvd": : :"memory");
+}
+
+#define read_cr0() (xen_read_cr0())
+#define write_cr0(x) (xen_write_cr0(x))
+#define read_cr2() (xen_read_cr2())
+#define write_cr2(x) (xen_write_cr2(x))
+#define read_cr3() (xen_read_cr3())
+#define write_cr3(x) (xen_write_cr3(x))
+#define read_cr4() (xen_read_cr4())
+#define read_cr4_safe() (xen_read_cr4_safe())
+#define write_cr4(x) (xen_write_cr4(x))
+#define wbinvd() (xen_wbinvd())
+#ifdef CONFIG_X86_64
+#define read_cr8() (xen_read_cr8())
+#define write_cr8(x) (xen_write_cr8(x))
+#endif
+
+/* Clear the 'TS' bit */
+#define clts() (xen_clts())
+#define stts() (xen_stts())
+
+#endif /* __KERNEL__ */
+
+static inline void clflush(volatile void *__p)
+{
+ asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
+}
+
+#define nop() asm volatile ("nop")
+
+void disable_hlt(void);
+void enable_hlt(void);
+
+extern int es7000_plat;
+void cpu_idle_wait(void);
+
+extern unsigned long arch_align_stack(unsigned long sp);
+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+
+void default_idle(void);
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
+#ifdef CONFIG_X86_32
+/*
+ * Some non-Intel clones support out of order store. wmb() ceases to be a
+ * nop for these.
+ */
+#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
+#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
+#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
+#else
+#define mb() asm volatile("mfence":::"memory")
+#define rmb() asm volatile("lfence":::"memory")
+#define wmb() asm volatile("sfence" ::: "memory")
+#endif
+
+/**
+ * read_barrier_depends - Flush all pending reads that subsequents reads
+ * depend on.
+ *
+ * No data-dependent reads from memory-like regions are ever reordered
+ * over this barrier. All reads preceding this primitive are guaranteed
+ * to access memory (but not necessarily other CPUs' caches) before any
+ * reads following this primitive that depend on the data return by
+ * any of the preceding reads. This primitive is much lighter weight than
+ * rmb() on most CPUs, and is never heavier weight than is
+ * rmb().
+ *
+ * These ordering constraints are respected by both the local CPU
+ * and the compiler.
+ *
+ * Ordering is not guaranteed by anything other than these primitives,
+ * not even by data dependencies. See the documentation for
+ * memory_barrier() for examples and URLs to more information.
+ *
+ * For example, the following code would force ordering (the initial
+ * value of "a" is zero, "b" is one, and "p" is "&a"):
+ *
+ * <programlisting>
+ * CPU 0 CPU 1
+ *
+ * b = 2;
+ * memory_barrier();
+ * p = &b; q = p;
+ * read_barrier_depends();
+ * d = *q;
+ * </programlisting>
+ *
+ * because the read of "*q" depends on the read of "p" and these
+ * two reads are separated by a read_barrier_depends(). However,
+ * the following code, with the same initial values for "a" and "b":
+ *
+ * <programlisting>
+ * CPU 0 CPU 1
+ *
+ * a = 2;
+ * memory_barrier();
+ * b = 3; y = b;
+ * read_barrier_depends();
+ * x = a;
+ * </programlisting>
+ *
+ * does not enforce ordering, since there is no data dependency between
+ * the read of "a" and the read of "b". Therefore, on some CPUs, such
+ * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
+ * in cases like this where there are no data dependencies.
+ **/
+
+#define read_barrier_depends() do { } while (0)
+
+#ifdef CONFIG_SMP
+#define smp_mb() mb()
+#ifdef CONFIG_X86_PPRO_FENCE
+# define smp_rmb() rmb()
+#else
+# define smp_rmb() barrier()
+#endif
+#ifdef CONFIG_X86_OOSTORE
+# define smp_wmb() wmb()
+#else
+# define smp_wmb() barrier()
+#endif
+#define smp_read_barrier_depends() read_barrier_depends()
+#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#else
+#define smp_mb() barrier()
+#define smp_rmb() barrier()
+#define smp_wmb() barrier()
+#define smp_read_barrier_depends() do { } while (0)
+#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#endif
+
+/*
+ * Stop RDTSC speculation. This is needed when you need to use RDTSC
+ * (or get_cycles or vread that possibly accesses the TSC) in a defined
+ * code region.
+ *
+ * (Could use an alternative three way for this if there was one.)
+ */
+static inline void rdtsc_barrier(void)
+{
+ alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
+ alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+}
+
+#endif
--- /dev/null
+#ifndef __ASM_SYSTEM_H
+#define __ASM_SYSTEM_H
+
+#include <asm/segment.h>
+#include <asm/cmpxchg.h>
+
+
+static inline unsigned long read_cr8(void)
+{
+ return 0;
+}
+
+static inline void write_cr8(unsigned long val)
+{
+ BUG_ON(val);
+}
+
+#include <linux/irqflags.h>
+
+#endif
--- /dev/null
+#ifndef _ASM_X86_TLBFLUSH_H
+#define _ASM_X86_TLBFLUSH_H
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+
+#define __flush_tlb() xen_tlb_flush()
+#define __flush_tlb_global() xen_tlb_flush()
+#define __flush_tlb_single(addr) xen_invlpg(addr)
+#define __flush_tlb_all() xen_tlb_flush()
+#define __flush_tlb_one(addr) xen_invlpg(addr)
+
+#ifdef CONFIG_X86_32
+# define TLB_FLUSH_ALL 0xffffffff
+#else
+# define TLB_FLUSH_ALL -1ULL
+#endif
+
+/*
+ * TLB flushing:
+ *
+ * - flush_tlb() flushes the current mm struct TLBs
+ * - flush_tlb_all() flushes all processes TLBs
+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ * - flush_tlb_page(vma, vmaddr) flushes one page
+ * - flush_tlb_range(vma, start, end) flushes a range of pages
+ * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
+ *
+ * ..but the i386 has somewhat limited tlb flushing capabilities,
+ * and page-granular flushes are available only on i486 and up.
+ *
+ * x86-64 can only flush individual pages or full VMs. For a range flush
+ * we always do the full VM. Might be worth trying if for a small
+ * range a few INVLPGs in a row are a win.
+ */
+
+#ifndef CONFIG_SMP
+
+#define flush_tlb() __flush_tlb()
+#define flush_tlb_all() __flush_tlb_all()
+#define local_flush_tlb() __flush_tlb()
+
+static inline void flush_tlb_mm(struct mm_struct *mm)
+{
+ if (mm == current->active_mm)
+ __flush_tlb();
+}
+
+static inline void flush_tlb_page(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ if (vma->vm_mm == current->active_mm)
+ __flush_tlb_one(addr);
+}
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ if (vma->vm_mm == current->active_mm)
+ __flush_tlb();
+}
+
+#else /* SMP */
+
+#include <asm/smp.h>
+
+#define local_flush_tlb() __flush_tlb()
+
+#define flush_tlb_all xen_tlb_flush_all
+#define flush_tlb_current_task() xen_tlb_flush_mask(¤t->mm->cpu_vm_mask)
+#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
+#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
+
+#define flush_tlb() flush_tlb_current_task()
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ flush_tlb_mm(vma->vm_mm);
+}
+
+#define TLBSTATE_OK 1
+#define TLBSTATE_LAZY 2
+
+#ifdef CONFIG_X86_32
+struct tlb_state {
+ struct mm_struct *active_mm;
+ int state;
+ char __cacheline_padding[L1_CACHE_BYTES-8];
+};
+DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
+#endif
+
+#endif /* SMP */
+
+static inline void flush_tlb_kernel_range(unsigned long start,
+ unsigned long end)
+{
+ flush_tlb_all();
+}
+
+#endif /* _ASM_X86_TLBFLUSH_H */
--- /dev/null
+/*
+ * Access to VGA videoram
+ *
+ * (c) 1998 Martin Mares <mj@ucw.cz>
+ */
+
+#ifndef _LINUX_ASM_VGA_H_
+#define _LINUX_ASM_VGA_H_
+
+/*
+ * On the PC, we can just recalculate addresses and then
+ * access the videoram directly without any black magic.
+ */
+
+#define VGA_MAP_MEM(x, s) (unsigned long)isa_bus_to_virt(x)
+
+#define vga_readb(x) (*(x))
+#define vga_writeb(x, y) (*(y) = (x))
+
+#endif
--- /dev/null
+/******************************************************************************
+ * asm-i386/mach-xen/asm/xenoprof.h
+ *
+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef __ASM_XENOPROF_H__
+#define __ASM_XENOPROF_H__
+#ifdef CONFIG_XEN
+
+struct super_block;
+struct dentry;
+int xenoprof_create_files(struct super_block * sb, struct dentry * root);
+#define HAVE_XENOPROF_CREATE_FILES
+
+struct xenoprof_init;
+void xenoprof_arch_init_counter(struct xenoprof_init *init);
+void xenoprof_arch_counter(void);
+void xenoprof_arch_start(void);
+void xenoprof_arch_stop(void);
+
+struct xenoprof_arch_shared_buffer {
+ /* nothing */
+};
+struct xenoprof_shared_buffer;
+void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf);
+struct xenoprof_get_buffer;
+int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf);
+struct xenoprof_passive;
+int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf);
+
+#endif /* CONFIG_XEN */
+#endif /* __ASM_XENOPROF_H__ */
--- /dev/null
+#ifdef CONFIG_X86_32
+# include "../../xor_32.h"
+#else
+# include "xor_64.h"
+#endif
--- /dev/null
+/*
+ * x86-64 changes / gcc fixes from Andi Kleen.
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ *
+ * This hasn't been optimized for the hammer yet, but there are likely
+ * no advantages to be gotten from x86-64 here anyways.
+ */
+
+typedef struct {
+ unsigned long a, b;
+} __attribute__((aligned(16))) xmm_store_t;
+
+/* Doesn't use gcc to save the XMM registers, because there is no easy way to
+ tell it to do a clts before the register saving. */
+#define XMMS_SAVE \
+do { \
+ preempt_disable(); \
+ if (!(current_thread_info()->status & TS_USEDFPU)) \
+ clts(); \
+ asm volatile( \
+ "movups %%xmm0,(%1) ;\n\t" \
+ "movups %%xmm1,0x10(%1) ;\n\t" \
+ "movups %%xmm2,0x20(%1) ;\n\t" \
+ "movups %%xmm3,0x30(%1) ;\n\t" \
+ : "=&r" (cr0) \
+ : "r" (xmm_save) \
+ : "memory"); \
+} while (0)
+
+#define XMMS_RESTORE \
+do { \
+ asm volatile( \
+ "sfence ;\n\t" \
+ "movups (%1),%%xmm0 ;\n\t" \
+ "movups 0x10(%1),%%xmm1 ;\n\t" \
+ "movups 0x20(%1),%%xmm2 ;\n\t" \
+ "movups 0x30(%1),%%xmm3 ;\n\t" \
+ : \
+ : "r" (cr0), "r" (xmm_save) \
+ : "memory"); \
+ if (!(current_thread_info()->status & TS_USEDFPU)) \
+ stts(); \
+ preempt_enable(); \
+} while (0)
+
+#define OFFS(x) "16*("#x")"
+#define PF_OFFS(x) "256+16*("#x")"
+#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
+#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
+#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
+#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
+#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
+#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
+#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
+#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
+#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
+#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
+#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
+#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
+#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
+
+
+static void
+xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned int lines = bytes >> 8;
+ unsigned long cr0;
+ xmm_store_t xmm_save[4];
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " decl %[cnt] ; jnz 1b"
+ : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
+ : [inc] "r" (256UL)
+ : "memory");
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned int lines = bytes >> 8;
+ xmm_store_t xmm_save[4];
+ unsigned long cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " addq %[inc], %[p3] ;\n"
+ " decl %[cnt] ; jnz 1b"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+ : [inc] "r" (256UL)
+ : "memory");
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned int lines = bytes >> 8;
+ xmm_store_t xmm_save[4];
+ unsigned long cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " addq %[inc], %[p3] ;\n"
+ " addq %[inc], %[p4] ;\n"
+ " decl %[cnt] ; jnz 1b"
+ : [cnt] "+c" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+ : [inc] "r" (256UL)
+ : "memory" );
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned int lines = bytes >> 8;
+ xmm_store_t xmm_save[4];
+ unsigned long cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ PF4(i) \
+ PF4(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ XO4(i, 0) \
+ XO4(i + 1, 1) \
+ XO4(i + 2, 2) \
+ XO4(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " addq %[inc], %[p3] ;\n"
+ " addq %[inc], %[p4] ;\n"
+ " addq %[inc], %[p5] ;\n"
+ " decl %[cnt] ; jnz 1b"
+ : [cnt] "+c" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
+ [p5] "+r" (p5)
+ : [inc] "r" (256UL)
+ : "memory");
+
+ XMMS_RESTORE;
+}
+
+static struct xor_block_template xor_block_sse = {
+ .name = "generic_sse",
+ .do_2 = xor_sse_2,
+ .do_3 = xor_sse_3,
+ .do_4 = xor_sse_4,
+ .do_5 = xor_sse_5,
+};
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES \
+do { \
+ xor_speed(&xor_block_sse); \
+} while (0)
+
+/* We force the use of the SSE xor block because it can write around L2.
+ We may also be able to load into the L1 only depending on how the cpu
+ deals with a load to a line that is being prefetched. */
+#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
--- /dev/null
+/*
+ * This file should contain #defines for all of the interrupt vector
+ * numbers used by this architecture.
+ *
+ * In addition, there are some standard defines:
+ *
+ * FIRST_EXTERNAL_VECTOR:
+ * The first free place for external interrupts
+ *
+ * SYSCALL_VECTOR:
+ * The IRQ vector a syscall makes the user to kernel transition
+ * under.
+ *
+ * TIMER_IRQ:
+ * The IRQ number the timer interrupt comes in at.
+ *
+ * NR_IRQS:
+ * The total number of interrupt vectors (including all the
+ * architecture specific interrupts) needed.
+ *
+ */
+#ifndef _ASM_IRQ_VECTORS_H
+#define _ASM_IRQ_VECTORS_H
+
+/*
+ * IDT vectors usable for external interrupt sources start
+ * at 0x20:
+ */
+#define FIRST_EXTERNAL_VECTOR 0x20
+
+#define SYSCALL_VECTOR 0x80
+
+/*
+ * Vectors 0x20-0x2f are used for ISA interrupts.
+ */
+
+#ifndef CONFIG_XEN
+/*
+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
+ *
+ * some of the following vectors are 'rare', they are merged
+ * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
+ * TLB, reschedule and local APIC vectors are performance-critical.
+ *
+ * Vectors 0xf0-0xfa are free (reserved for future Linux use).
+ */
+#define SPURIOUS_APIC_VECTOR 0xff
+#define ERROR_APIC_VECTOR 0xfe
+#define INVALIDATE_TLB_VECTOR 0xfd
+#define RESCHEDULE_VECTOR 0xfc
+#define CALL_FUNCTION_VECTOR 0xfb
+
+#define THERMAL_APIC_VECTOR 0xf0
+/*
+ * Local APIC timer IRQ vector is on a different priority level,
+ * to work around the 'lost local interrupt if more than 2 IRQ
+ * sources per level' errata.
+ */
+#define LOCAL_TIMER_VECTOR 0xef
+
+#define SPURIOUS_APIC_VECTOR 0xff
+#define ERROR_APIC_VECTOR 0xfe
+
+/*
+ * First APIC vector available to drivers: (vectors 0x30-0xee)
+ * we start at 0x31 to spread out vectors evenly between priority
+ * levels. (0x80 is the syscall vector)
+ */
+#define FIRST_DEVICE_VECTOR 0x31
+#define FIRST_SYSTEM_VECTOR 0xef
+
+/*
+ * 16 8259A IRQ's, 208 potential APIC interrupt sources.
+ * Right now the APIC is mostly only used for SMP.
+ * 256 vectors is an architectural limit. (we can have
+ * more than 256 devices theoretically, but they will
+ * have to use shared interrupts)
+ * Since vectors 0x00-0x1f are used/reserved for the CPU,
+ * the usable vector space is 0x20-0xff (224 vectors)
+ */
+#endif
+
+#define RESCHEDULE_VECTOR 0
+#define CALL_FUNCTION_VECTOR 1
+#define NR_IPIS 2
+
+/*
+ * The maximum number of vectors supported by i386 processors
+ * is limited to 256. For processors other than i386, NR_VECTORS
+ * should be changed accordingly.
+ */
+#define NR_VECTORS 256
+
+#define FPU_IRQ 13
+
+#define FIRST_VM86_IRQ 3
+#define LAST_VM86_IRQ 15
+#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
+
+/*
+ * The flat IRQ space is divided into two regions:
+ * 1. A one-to-one mapping of real physical IRQs. This space is only used
+ * if we have physical device-access privilege. This region is at the
+ * start of the IRQ space so that existing device drivers do not need
+ * to be modified to translate physical IRQ numbers into our IRQ space.
+ * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
+ * are bound using the provided bind/unbind functions.
+ */
+
+#define PIRQ_BASE 0
+#define NR_PIRQS 256
+
+#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
+#define NR_DYNIRQS 256
+
+#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
+#define NR_IRQ_VECTORS NR_IRQS
+
+#define pirq_to_irq(_x) ((_x) + PIRQ_BASE)
+#define irq_to_pirq(_x) ((_x) - PIRQ_BASE)
+
+#define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE)
+#define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE)
+
+#endif /* _ASM_IRQ_VECTORS_H */
--- /dev/null
+#ifndef __ASM_MACH_APIC_H
+#define __ASM_MACH_APIC_H
+
+#include <linux/cpumask.h>
+
+#ifdef CONFIG_X86_64
+
+#include <asm/genapic.h>
+#define INT_DELIVERY_MODE (genapic->int_delivery_mode)
+#define INT_DEST_MODE (genapic->int_dest_mode)
+#define TARGET_CPUS (genapic->target_cpus())
+#define apic_id_registered (genapic->apic_id_registered)
+#define init_apic_ldr (genapic->init_apic_ldr)
+#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
+#define phys_pkg_id (genapic->phys_pkg_id)
+#define vector_allocation_domain (genapic->vector_allocation_domain)
+extern void setup_apic_routing(void);
+
+#else
+
+#ifdef CONFIG_SMP
+#define TARGET_CPUS cpu_online_map
+#else
+#define TARGET_CPUS cpumask_of_cpu(0)
+#endif
+
+#define INT_DELIVERY_MODE dest_LowestPrio
+#define INT_DEST_MODE 1
+
+static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
+{
+ return cpuid_apic;
+}
+
+static inline void setup_apic_routing(void)
+{
+}
+
+static inline int multi_timer_check(int apic, int irq)
+{
+ return 0;
+}
+
+static inline int apicid_to_node(int logical_apicid)
+{
+ return 0;
+}
+
+static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ return cpus_addr(cpumask)[0];
+}
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* __ASM_MACH_APIC_H */
--- /dev/null
+#include "../mc146818rtc_32.h"
+#include "../mach-default/mach_time.h"
--- /dev/null
+#include "../mach-default/mach_timer.h"
--- /dev/null
+/*
+ * include/asm-xen/asm-i386/mach-xen/mach_traps.h
+ *
+ * Machine specific NMI handling for Xen
+ */
+#ifndef _MACH_TRAPS_H
+#define _MACH_TRAPS_H
+
+#include <linux/bitops.h>
+#include <xen/interface/nmi.h>
+
+static inline void clear_mem_error(unsigned char reason) {}
+static inline void clear_io_check_error(unsigned char reason) {}
+
+static inline unsigned char get_nmi_reason(void)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+ unsigned char reason = 0;
+
+ /* construct a value which looks like it came from
+ * port 0x61.
+ */
+ if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
+ reason |= 0x40;
+ if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
+ reason |= 0x80;
+
+ return reason;
+}
+
+static inline void reassert_nmi(void) {}
+
+#endif /* !_MACH_TRAPS_H */
--- /dev/null
+/* Hook to call BIOS initialisation function */
+
+#define ARCH_SETUP machine_specific_arch_setup();
+
+void __init machine_specific_arch_setup(void);
--- /dev/null
+/**
+ * machine_specific_* - Hooks for machine specific setup.
+ *
+ * Description:
+ * This is included late in kernel/setup.c so that it can make
+ * use of all of the static functions.
+ **/
+
+#include <xen/interface/callback.h>
+
+extern void hypervisor_callback(void);
+extern void failsafe_callback(void);
+extern void nmi(void);
+
+static void __init machine_specific_arch_setup(void)
+{
+ int ret;
+ static struct callback_register __initdata event = {
+ .type = CALLBACKTYPE_event,
+ .address = (unsigned long) hypervisor_callback,
+ };
+ static struct callback_register __initdata failsafe = {
+ .type = CALLBACKTYPE_failsafe,
+ .address = (unsigned long)failsafe_callback,
+ };
+ static struct callback_register __initdata syscall = {
+ .type = CALLBACKTYPE_syscall,
+ .address = (unsigned long)system_call,
+ };
+ static struct callback_register __initdata nmi_cb = {
+ .type = CALLBACKTYPE_nmi,
+ .address = (unsigned long)nmi,
+ };
+
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
+ if (ret == 0)
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
+ if (ret == 0)
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (ret == -ENOSYS)
+ ret = HYPERVISOR_set_callbacks(
+ event.address,
+ failsafe.address,
+ syscall.address);
+#endif
+ BUG_ON(ret);
+
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (ret == -ENOSYS) {
+ static struct xennmi_callback __initdata cb = {
+ .handler_address = (unsigned long)nmi
+ };
+
+ HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
+ }
+#endif
+}
--- /dev/null
+/* Hook to call BIOS initialisation function */
+
+#define ARCH_SETUP machine_specific_arch_setup();
+
+static void __init machine_specific_arch_setup(void);
rwlock_t ldtlock;
#endif
int size;
+#ifdef CONFIG_XEN
+ unsigned has_foreign_mappings:1;
+#endif
struct mutex lock;
void *vdso;
} mm_context_t;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
void leave_mm(int cpu);
#else
static inline void leave_mm(int cpu)
#include <asm/irq.h>
#include <asm/io.h>
-#ifdef ARCH_HAS_NMI_WATCHDOG
-
/**
* do_nmi_callback
*
#define nmi_watchdog_default() do {} while (0)
#endif
+#ifdef ARCH_HAS_NMI_WATCHDOG
+
extern int check_nmi_watchdog(void);
extern int nmi_watchdog_enabled;
-extern int unknown_nmi_panic;
extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
extern int avail_to_resrv_perfctr_nmi(unsigned int);
extern int reserve_perfctr_nmi(unsigned int);
void __trigger_all_cpu_backtrace(void);
#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
-#endif
-
void lapic_watchdog_stop(void);
int lapic_watchdog_init(unsigned nmi_hz);
int lapic_wd_event(unsigned nmi_hz);
int lapic_watchdog_ok(void);
void disable_lapic_nmi_watchdog(void);
void enable_lapic_nmi_watchdog(void);
+
+#endif
+
void stop_nmi(void);
void restart_nmi(void);
+extern int unknown_nmi_panic;
+
#endif
extern void user_disable_single_step(struct task_struct *);
extern void user_enable_block_step(struct task_struct *);
-#ifdef CONFIG_X86_DEBUGCTLMSR
+#if defined(CONFIG_XEN)
+#define arch_has_block_step() (0)
+#elif defined(CONFIG_X86_DEBUGCTLMSR)
#define arch_has_block_step() (1)
#else
#define arch_has_block_step() (boot_cpu_data.x86 >= 6)
* returns.
*/
#define sg_dma_address(sg) ((sg)->dma_address)
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
# define sg_dma_len(sg) ((sg)->length)
#else
# define sg_dma_len(sg) ((sg)->dma_length)
#define TIF_DEBUGCTLMSR 22 /* uses thread_struct.debugctlmsr */
#define TIF_DS_AREA_MSR 23 /* uses thread_struct.ds_area_msr */
#define TIF_BTS_TRACE_TS 24 /* record scheduling event timestamps */
+#ifdef CONFIG_XEN
+#define TIF_CSTAR 31 /* cstar-based syscall (special handling) */
+#endif
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
+#define _TIF_CSTAR (1 << TIF_CSTAR)
/* work to do on interrupt/exception return */
#define _TIF_WORK_MASK \
#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
/* flags to check in __switch_to() */
+#ifndef CONFIG_XEN
#define _TIF_WORK_CTXSW \
(_TIF_IO_BITMAP | _TIF_NOTSC | _TIF_DEBUGCTLMSR | \
_TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS)
+#else
+#define _TIF_WORK_CTXSW \
+ (_TIF_NOTSC /*todo | _TIF_DEBUGCTLMSR | \
+ _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS*/)
+#endif
#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW | _TIF_DEBUG)
(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
/* flags to check in __switch_to() */
+#ifndef CONFIG_XEN
#define _TIF_WORK_CTXSW \
(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS|_TIF_NOTSC)
+#else
+#define _TIF_WORK_CTXSW \
+ (/*todo _TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS|*/_TIF_NOTSC)
+#endif
#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
#endif /* CONFIG_PARAVIRT */
+#ifdef CONFIG_XEN
+extern int xen_independent_wallclock(void);
+extern unsigned long xen_read_persistent_clock(void);
+extern int xen_update_persistent_clock(void);
+#endif
+
#endif
#define __XEN_PUBLIC_ARCH_X86_32_H__
#ifdef __XEN__
-#define __DEFINE_GUEST_HANDLE(name, type) \
+#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
typedef struct { type *p; } __guest_handle_ ## name
#else
-#define __DEFINE_GUEST_HANDLE(name, type) \
+#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
typedef type * __guest_handle_ ## name
#endif
-#define DEFINE_GUEST_HANDLE_STRUCT(name) \
- __DEFINE_GUEST_HANDLE(name, struct name)
-#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
-#define GUEST_HANDLE(name) __guest_handle_ ## name
+#define DEFINE_XEN_GUEST_HANDLE_STRUCT(name) \
+ __DEFINE_XEN_GUEST_HANDLE(name, struct name)
+#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
+#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name
#ifdef __XEN__
#if defined(__i386__)
#endif
#ifndef __ASSEMBLY__
-/* Guest handles for primitive C types. */
-__DEFINE_GUEST_HANDLE(uchar, unsigned char);
-__DEFINE_GUEST_HANDLE(uint, unsigned int);
-__DEFINE_GUEST_HANDLE(ulong, unsigned long);
-DEFINE_GUEST_HANDLE(char);
-DEFINE_GUEST_HANDLE(int);
-DEFINE_GUEST_HANDLE(long);
-DEFINE_GUEST_HANDLE(void);
+typedef unsigned long xen_pfn_t;
+typedef unsigned long xen_ulong_t;
#endif
/*
uint16_t cs; /* code selector */
unsigned long address; /* code offset */
};
-DEFINE_GUEST_HANDLE_STRUCT(trap_info);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(trap_info);
struct cpu_user_regs {
uint32_t ebx;
uint16_t fs, _pad4;
uint16_t gs, _pad5;
};
-DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(cpu_user_regs);
typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
unsigned long failsafe_callback_eip;
unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
};
-DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_guest_context);
struct arch_shared_info {
unsigned long max_pfn; /* max pfn that appears in table */
unsigned long cs;
unsigned long eip;
};
+typedef struct xen_callback xen_callback_t;
#endif /* !__ASSEMBLY__ */
/*
struct aio_ring_info ring_info;
struct delayed_work wq;
+#ifdef CONFIG_EPOLL
+ // poll integration
+ wait_queue_head_t poll_wait;
+ struct file *file;
+#endif
};
/* prototypes */
4484:.balign 4 ; \
.popsection ;
-#define ELFNOTE(name, type, desc) \
+#define ELFNOTE(name, type, desc...) \
ELFNOTE_START(name, type, "") \
desc ; \
ELFNOTE_END
}
#endif /* CONFIG_GENERIC_HARDIRQS */
+#ifdef CONFIG_HAVE_IRQ_IGNORE_UNHANDLED
+int irq_ignore_unhandled(unsigned int irq);
+#else
+#define irq_ignore_unhandled(irq) 0
+#endif
+
#ifndef __ARCH_SET_SOFTIRQ_PENDING
#define set_softirq_pending(x) (local_softirq_pending() = (x))
#define or_softirq_pending(x) (local_softirq_pending() |= (x))
KEXEC_CORE_NOTE_NAME_BYTES + \
KEXEC_CORE_NOTE_DESC_BYTES )
+#ifndef KEXEC_ARCH_HAS_PAGE_MACROS
+#define kexec_page_to_pfn(page) page_to_pfn(page)
+#define kexec_pfn_to_page(pfn) pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#endif
+
/*
* This structure is used to hold the arguments that are used when loading
* kernel binaries.
extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
extern int machine_kexec_prepare(struct kimage *image);
extern void machine_kexec_cleanup(struct kimage *image);
+#ifdef CONFIG_XEN
+extern int xen_machine_kexec_load(struct kimage *image);
+extern void xen_machine_kexec_unload(struct kimage *image);
+extern void xen_machine_kexec_setup_resources(void);
+extern void xen_machine_kexec_register_resources(struct resource *res);
+#endif
extern asmlinkage long sys_kexec_load(unsigned long entry,
unsigned long nr_segments,
struct kexec_segment __user *segments,
#define kexec_flush_icache_page(page)
#endif
+#ifndef kexec_flush_icache_page
+#define kexec_flush_icache_page(page)
+#endif
+
#define KEXEC_ON_CRASH 0x00000001
#define KEXEC_ARCH_MASK 0xffff0000
#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
#define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
+#ifdef CONFIG_XEN
+#define VM_FOREIGN 0x00200000 /* Has pages belonging to another VM */
+#endif
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */
/* notification that a previously read-only page is about to become
* writable, if an error is returned it will cause a SIGBUS */
int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page);
+#ifdef CONFIG_XEN
+ /* Area-specific function for clearing the PTE at @ptep. Returns the
+ * original value of @ptep. */
+ pte_t (*zap_pte)(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, int is_fullmm);
+#endif
#ifdef CONFIG_NUMA
/*
* set_policy() op must add a reference to any non-NULL @new mempolicy
#include <linux/types.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>
+#ifdef CONFIG_XEN
+#include <xen/interface/xenoprof.h>
+#endif
/* Each escaped entry is prefixed by ESCAPE_CODE
* then one of the following codes, then the
#define CPU_SWITCH_CODE 2
#define COOKIE_SWITCH_CODE 3
#define KERNEL_ENTER_SWITCH_CODE 4
-#define KERNEL_EXIT_SWITCH_CODE 5
+#define USER_ENTER_SWITCH_CODE 5
#define MODULE_LOADED_CODE 6
#define CTX_TGID_CODE 7
#define TRACE_BEGIN_CODE 8
#define XEN_ENTER_SWITCH_CODE 10
#define SPU_PROFILING_CODE 11
#define SPU_CTX_SWITCH_CODE 12
+#define DOMAIN_SWITCH_CODE 13
struct super_block;
struct dentry;
/* create any necessary configuration files in the oprofile fs.
* Optional. */
int (*create_files)(struct super_block * sb, struct dentry * root);
+#ifdef CONFIG_XEN
+ /* setup active domains with Xen */
+ int (*set_active)(int *active_domains, unsigned int adomains);
+ /* setup passive domains with Xen */
+ int (*set_passive)(int *passive_domains, unsigned int pdomains);
+#endif
/* Do any necessary interrupt setup. Optional. */
int (*setup)(void);
/* Do any necessary interrupt shutdown. Optional. */
/* add a backtrace entry, to be called from the ->backtrace callback */
void oprofile_add_trace(unsigned long eip);
+/* add a domain switch entry */
+int oprofile_add_domain_switch(int32_t domain_id);
/**
* Create a file of the given name as a child of the given root, with
#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
PG_uncached, /* Page has been mapped as uncached */
#endif
+#ifdef CONFIG_XEN
+ PG_foreign, /* Page is owned by foreign allocator. */
+ PG_pinned, /* Cannot alias with PG_owner_priv_1 since
+ * bad_page() checks include this bit.
+ * Should not use PG_arch_1 as that may have
+ * a different purpose elsewhere. */
+#endif
__NR_PAGEFLAGS
};
PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
__PAGEFLAG(Slab, slab)
PAGEFLAG(Checked, owner_priv_1) /* Used by some filesystems */
+#ifdef CONFIG_PARAVIRT_XEN
PAGEFLAG(Pinned, owner_priv_1) TESTSCFLAG(Pinned, owner_priv_1) /* Xen */
+#elif defined(CONFIG_XEN)
+PAGEFLAG(Pinned, owner_priv_1) TESTSCFLAG(Pinned, pinned)
+#endif
PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
__SETPAGEFLAG(Private, private)
CLEARPAGEFLAG(Uptodate, uptodate)
+#define PageForeign(page) test_bit(PG_foreign, &(page)->flags)
+#define SetPageForeign(_page, dtor) do { \
+ set_bit(PG_foreign, &(_page)->flags); \
+ BUG_ON((dtor) == (void (*)(struct page *))0); \
+ (_page)->index = (long)(dtor); \
+} while (0)
+#define ClearPageForeign(page) do { \
+ clear_bit(PG_foreign, &(page)->flags); \
+ (page)->index = 0; \
+} while (0)
+#define PageForeignDestructor(_page) \
+ ((void (*)(struct page *))(_page)->index)(_page)
+
extern void cancel_dirty_page(struct page *page, unsigned int account_size);
int test_clear_page_writeback(struct page *page);
#endif /* !PAGEFLAGS_EXTENDED */
+#if !defined(CONFIG_XEN)
+# define PAGE_FLAGS_XEN 0
+#elif defined(CONFIG_X86)
+# define PAGE_FLAGS_XEN ((1 << PG_pinned) | (1 << PG_foreign))
+#else
+# define PAGE_FLAGS_XEN (1 << PG_foreign)
+#endif
+
#define PAGE_FLAGS (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \
- 1 << PG_buddy | 1 << PG_writeback | \
+ 1 << PG_buddy | 1 << PG_writeback | PAGE_FLAGS_XEN | \
1 << PG_slab | 1 << PG_swapcache | 1 << PG_active)
/*
void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
int __must_check pci_assign_resource(struct pci_dev *dev, int i);
int pci_select_bars(struct pci_dev *dev, unsigned long flags);
+#ifdef CONFIG_XEN
+void pci_restore_bars(struct pci_dev *);
+#endif
/* ROM control related routines */
void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
* @local_df: allow local fragmentation
* @cloned: Head may be cloned (check refcnt to be sure)
* @nohdr: Payload reference only, must not modify header
+ * @proto_data_valid: Protocol data validated since arriving at localhost
+ * @proto_csum_blank: Protocol csum must be added before leaving localhost
* @pkt_type: Packet class
* @fclone: skbuff clone status
* @ip_summed: Driver fed us an IP checksum
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
- /* 14 bit hole */
+#ifdef CONFIG_XEN
+ __u8 proto_data_valid:1,
+ proto_csum_blank:1;
+#endif
+ /* 12 bit hole */
#ifdef CONFIG_NET_DMA
dma_cookie_t dma_cookie;
}
bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
+
+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
+int skb_checksum_setup(struct sk_buff *skb);
+#else
+static inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
+#endif
+
#endif /* __KERNEL__ */
#endif /* _LINUX_SKBUFF_H */
CTL_BUS=8, /* Busses */
CTL_ABI=9, /* Binary emulation */
CTL_CPU=10, /* CPU stuff (speed scaling, etc) */
+ CTL_XEN=123, /* Xen info and control */
CTL_ARLAN=254, /* arlan wireless driver */
CTL_S390DBF=5677, /* s390 debug */
CTL_SUNRPC=7249, /* sunrpc debug */
#else
#define MODULE_VERMAGIC_MODVERSIONS ""
#endif
+#ifdef CONFIG_XEN
+#define MODULE_VERMAGIC_XEN "Xen "
+#else
+#define MODULE_VERMAGIC_XEN
+#endif
#ifndef MODULE_ARCH_VERMAGIC
#define MODULE_ARCH_VERMAGIC ""
#endif
UTS_RELEASE " " \
MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \
MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \
- MODULE_ARCH_VERMAGIC
+ MODULE_VERMAGIC_XEN MODULE_ARCH_VERMAGIC
#include <linux/spinlock.h>
-#if 0
+#ifndef CONFIG_PARAVIRT_XEN
/*
* Inform the balloon driver that it should allow some slop for device-driver
* memory activities.
struct page **alloc_empty_pages_and_pagevec(int nr_pages);
void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages);
+/* Free an empty page range (not allocated through
+ alloc_empty_pages_and_pagevec), adding to the balloon. */
+void free_empty_pages(struct page **pagevec, int nr_pages);
+
void balloon_release_driver_page(struct page *page);
/*
--- /dev/null
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_BLKIF_H__
+#define __XEN_BLKIF_H__
+
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/protocols.h>
+
+/* Not a real protocol. Used to generate ring structs which contain
+ * the elements common to all protocols only. This way we get a
+ * compiler-checkable way to use common struct elements, so we can
+ * avoid using switch(protocol) in a number of places. */
+struct blkif_common_request {
+ char dummy;
+};
+struct blkif_common_response {
+ char dummy;
+};
+
+/* i386 protocol version */
+#pragma pack(push, 4)
+struct blkif_x86_32_request {
+ uint8_t operation; /* BLKIF_OP_??? */
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+ uint64_t id; /* private guest value, echoed in resp */
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+struct blkif_x86_32_response {
+ uint64_t id; /* copied from request */
+ uint8_t operation; /* copied from request */
+ int16_t status; /* BLKIF_RSP_??? */
+};
+typedef struct blkif_x86_32_request blkif_x86_32_request_t;
+typedef struct blkif_x86_32_response blkif_x86_32_response_t;
+#pragma pack(pop)
+
+/* x86_64 protocol version */
+struct blkif_x86_64_request {
+ uint8_t operation; /* BLKIF_OP_??? */
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+ uint64_t __attribute__((__aligned__(8))) id;
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+struct blkif_x86_64_response {
+ uint64_t __attribute__((__aligned__(8))) id;
+ uint8_t operation; /* copied from request */
+ int16_t status; /* BLKIF_RSP_??? */
+};
+typedef struct blkif_x86_64_request blkif_x86_64_request_t;
+typedef struct blkif_x86_64_response blkif_x86_64_response_t;
+
+DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
+DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
+DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
+
+union blkif_back_rings {
+ blkif_back_ring_t native;
+ blkif_common_back_ring_t common;
+ blkif_x86_32_back_ring_t x86_32;
+ blkif_x86_64_back_ring_t x86_64;
+};
+typedef union blkif_back_rings blkif_back_rings_t;
+
+enum blkif_protocol {
+ BLKIF_PROTOCOL_NATIVE = 1,
+ BLKIF_PROTOCOL_X86_32 = 2,
+ BLKIF_PROTOCOL_X86_64 = 3,
+};
+
+static void inline blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
+{
+ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ dst->operation = src->operation;
+ dst->nr_segments = src->nr_segments;
+ dst->handle = src->handle;
+ dst->id = src->id;
+ dst->sector_number = src->sector_number;
+ if (n > src->nr_segments)
+ n = src->nr_segments;
+ for (i = 0; i < n; i++)
+ dst->seg[i] = src->seg[i];
+}
+
+static void inline blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
+{
+ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ dst->operation = src->operation;
+ dst->nr_segments = src->nr_segments;
+ dst->handle = src->handle;
+ dst->id = src->id;
+ dst->sector_number = src->sector_number;
+ if (n > src->nr_segments)
+ n = src->nr_segments;
+ for (i = 0; i < n; i++)
+ dst->seg[i] = src->seg[i];
+}
+
+#endif /* __XEN_BLKIF_H__ */
--- /dev/null
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ *
+ * Authors: Jimi Xenidis <jimix@watson.ibm.com>
+ * Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#ifndef __LINUX_XEN_COMPAT_H__
+#define __LINUX_XEN_COMPAT_H__
+
+#include <linux/compat.h>
+
+extern int privcmd_ioctl_32(int fd, unsigned int cmd, unsigned long arg);
+struct privcmd_mmap_32 {
+ int num;
+ domid_t dom;
+ compat_uptr_t entry;
+};
+
+struct privcmd_mmapbatch_32 {
+ int num; /* number of pages to populate */
+ domid_t dom; /* target domain */
+ __u64 addr; /* virtual address */
+ compat_uptr_t arr; /* array of mfns - top nibble set on err */
+};
+#define IOCTL_PRIVCMD_MMAP_32 \
+ _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap_32))
+#define IOCTL_PRIVCMD_MMAPBATCH_32 \
+ _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch_32))
+
+#endif /* __LINUX_XEN_COMPAT_H__ */
--- /dev/null
+#ifndef __XEN_CPU_HOTPLUG_H__
+#define __XEN_CPU_HOTPLUG_H__
+
+#include <linux/kernel.h>
+#include <linux/cpumask.h>
+
+#if defined(CONFIG_X86) && defined(CONFIG_SMP)
+extern cpumask_t cpu_initialized_map;
+#endif
+
+#if defined(CONFIG_HOTPLUG_CPU)
+
+int cpu_up_check(unsigned int cpu);
+void init_xenbus_allowed_cpumask(void);
+int smp_suspend(void);
+void smp_resume(void);
+
+void cpu_bringup(void);
+
+#else /* !defined(CONFIG_HOTPLUG_CPU) */
+
+#define cpu_up_check(cpu) (0)
+#define init_xenbus_allowed_cpumask() ((void)0)
+
+static inline int smp_suspend(void)
+{
+ if (num_online_cpus() > 1) {
+ printk(KERN_WARNING "Can't suspend SMP guests "
+ "without CONFIG_HOTPLUG_CPU\n");
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static inline void smp_resume(void)
+{
+}
+
+#endif /* !defined(CONFIG_HOTPLUG_CPU) */
+
+#endif /* __XEN_CPU_HOTPLUG_H__ */
--- /dev/null
+
+#ifndef __ASM_XEN_DRIVER_UTIL_H__
+#define __ASM_XEN_DRIVER_UTIL_H__
+
+#include <linux/vmalloc.h>
+#include <linux/device.h>
+
+extern struct class *get_xen_class(void);
+
+#endif /* __ASM_XEN_DRIVER_UTIL_H__ */
--- /dev/null
+/******************************************************************************
+ * evtchn.h
+ *
+ * Communication via Xen event channels.
+ * Also definitions for the device that demuxes notifications to userspace.
+ *
+ * Copyright (c) 2004-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __ASM_EVTCHN_H__
+#define __ASM_EVTCHN_H__
+
+#include <linux/interrupt.h>
+#include <asm/hypervisor.h>
+#include <asm/ptrace.h>
+#include <asm/synch_bitops.h>
+#include <xen/interface/event_channel.h>
+#include <linux/smp.h>
+
+/*
+ * LOW-LEVEL DEFINITIONS
+ */
+
+/*
+ * Dynamically bind an event source to an IRQ-like callback handler.
+ * On some platforms this may not be implemented via the Linux IRQ subsystem.
+ * The IRQ argument passed to the callback handler is the same as returned
+ * from the bind call. It may not correspond to a Linux IRQ number.
+ * Returns IRQ or negative errno.
+ */
+int bind_caller_port_to_irqhandler(
+ unsigned int caller_port,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id);
+int bind_listening_port_to_irqhandler(
+ unsigned int remote_domain,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id);
+int bind_interdomain_evtchn_to_irqhandler(
+ unsigned int remote_domain,
+ unsigned int remote_port,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id);
+int bind_virq_to_irqhandler(
+ unsigned int virq,
+ unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id);
+int bind_ipi_to_irqhandler(
+ unsigned int ipi,
+ unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id);
+
+/*
+ * Common unbind function for all event sources. Takes IRQ to unbind from.
+ * Automatically closes the underlying event channel (except for bindings
+ * made with bind_caller_port_to_irqhandler()).
+ */
+void unbind_from_irqhandler(unsigned int irq, void *dev_id);
+
+void irq_resume(void);
+
+/* Entry point for notifications into Linux subsystems. */
+asmlinkage void evtchn_do_upcall(struct pt_regs *regs);
+
+/* Entry point for notifications into the userland character device. */
+void evtchn_device_upcall(int port);
+
+void mask_evtchn(int port);
+void disable_all_local_evtchn(void);
+void unmask_evtchn(int port);
+
+#ifdef CONFIG_SMP
+void rebind_evtchn_to_cpu(int port, unsigned int cpu);
+#else
+#define rebind_evtchn_to_cpu(port, cpu) ((void)0)
+#endif
+
+static inline int test_and_set_evtchn_mask(int port)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+ return synch_test_and_set_bit(port, s->evtchn_mask);
+}
+
+static inline void clear_evtchn(int port)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+ synch_clear_bit(port, s->evtchn_pending);
+}
+
+static inline void notify_remote_via_evtchn(int port)
+{
+ struct evtchn_send send = { .port = port };
+ VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send));
+}
+
+/*
+ * Use these to access the event channel underlying the IRQ handle returned
+ * by bind_*_to_irqhandler().
+ */
+void notify_remote_via_irq(int irq);
+int irq_to_evtchn_port(int irq);
+
+#endif /* __ASM_EVTCHN_H__ */
#define __XEN_FEATURES_H__
#include <xen/interface/features.h>
+#include <xen/interface/version.h>
void xen_setup_features(void);
return xen_features[flag];
}
-#endif /* __ASM_XEN_FEATURES_H__ */
+#endif /* __XEN_FEATURES_H__ */
--- /dev/null
+#ifndef __XEN_FIRMWARE_H__
+#define __XEN_FIRMWARE_H__
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+void copy_edd(void);
+#endif
+
+void copy_edid(void);
+
+#endif /* __XEN_FIRMWARE_H__ */
--- /dev/null
+/******************************************************************************
+ * gnttab.h
+ *
+ * Two sets of functionality:
+ * 1. Granting foreign access to our memory reservation.
+ * 2. Accessing others' memory reservations via grant references.
+ * (i.e., mechanisms for both sender and recipient of grant references)
+ *
+ * Copyright (c) 2004-2005, K A Fraser
+ * Copyright (c) 2005, Christopher Clark
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __ASM_GNTTAB_H__
+#define __ASM_GNTTAB_H__
+
+#include <asm/hypervisor.h>
+#include <asm/maddr.h> /* maddr_t */
+#include <linux/mm.h>
+#include <xen/interface/grant_table.h>
+#include <xen/features.h>
+
+struct gnttab_free_callback {
+ struct gnttab_free_callback *next;
+ void (*fn)(void *);
+ void *arg;
+ u16 count;
+};
+
+int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
+ int flags);
+
+/*
+ * End access through the given grant reference, iff the grant entry is no
+ * longer in use. Return 1 if the grant entry was freed, 0 if it is still in
+ * use.
+ */
+int gnttab_end_foreign_access_ref(grant_ref_t ref);
+
+/*
+ * Eventually end access through the given grant reference, and once that
+ * access has been ended, free the given page too. Access will be ended
+ * immediately iff the grant entry is not in use, otherwise it will happen
+ * some time later. page may be 0, in which case no freeing will occur.
+ */
+void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page);
+
+int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
+
+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
+unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
+
+int gnttab_query_foreign_access(grant_ref_t ref);
+
+/*
+ * operations on reserved batches of grant references
+ */
+int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
+
+void gnttab_free_grant_reference(grant_ref_t ref);
+
+void gnttab_free_grant_references(grant_ref_t head);
+
+int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
+
+int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
+
+void gnttab_release_grant_reference(grant_ref_t *private_head,
+ grant_ref_t release);
+
+void gnttab_request_free_callback(struct gnttab_free_callback *callback,
+ void (*fn)(void *), void *arg, u16 count);
+void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
+
+void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
+ unsigned long frame, int flags);
+
+void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
+ unsigned long pfn);
+
+int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep);
+void __gnttab_dma_map_page(struct page *page);
+static inline void __gnttab_dma_unmap_page(struct page *page)
+{
+}
+
+void gnttab_reset_grant_page(struct page *page);
+
+int gnttab_suspend(void);
+int gnttab_resume(void);
+
+void *arch_gnttab_alloc_shared(unsigned long *frames);
+
+static inline void
+gnttab_set_map_op(struct gnttab_map_grant_ref *map, maddr_t addr,
+ uint32_t flags, grant_ref_t ref, domid_t domid)
+{
+ if (flags & GNTMAP_contains_pte)
+ map->host_addr = addr;
+ else if (xen_feature(XENFEAT_auto_translated_physmap))
+ map->host_addr = __pa(addr);
+ else
+ map->host_addr = addr;
+
+ map->flags = flags;
+ map->ref = ref;
+ map->dom = domid;
+}
+
+static inline void
+gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, maddr_t addr,
+ uint32_t flags, grant_handle_t handle)
+{
+ if (flags & GNTMAP_contains_pte)
+ unmap->host_addr = addr;
+ else if (xen_feature(XENFEAT_auto_translated_physmap))
+ unmap->host_addr = __pa(addr);
+ else
+ unmap->host_addr = addr;
+
+ unmap->handle = handle;
+ unmap->dev_bus_addr = 0;
+}
+
+static inline void
+gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, maddr_t addr,
+ maddr_t new_addr, grant_handle_t handle)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+ unmap->host_addr = __pa(addr);
+ unmap->new_addr = __pa(new_addr);
+ } else {
+ unmap->host_addr = addr;
+ unmap->new_addr = new_addr;
+ }
+
+ unmap->handle = handle;
+}
+
+#endif /* __ASM_GNTTAB_H__ */
--- /dev/null
+/* Simple wrappers around HVM functions */
+#ifndef XEN_HVM_H__
+#define XEN_HVM_H__
+
+#include <xen/interface/hvm/params.h>
+
+static inline unsigned long hvm_get_parameter(int idx)
+{
+ struct xen_hvm_param xhv;
+ int r;
+
+ xhv.domid = DOMID_SELF;
+ xhv.index = idx;
+ r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
+ if (r < 0) {
+ printk(KERN_ERR "cannot get hvm parameter %d: %d.\n",
+ idx, r);
+ return 0;
+ }
+ return xhv.value;
+}
+
+#endif /* XEN_HVM_H__ */
--- /dev/null
+#ifndef __XEN_HYPERCALL_H__
+#define __XEN_HYPERCALL_H__
+
+#include <asm/hypercall.h>
+
+static inline int __must_check
+HYPERVISOR_multicall_check(
+ multicall_entry_t *call_list, unsigned int nr_calls,
+ const unsigned long *rc_list)
+{
+ int rc = HYPERVISOR_multicall(call_list, nr_calls);
+
+ if (unlikely(rc < 0))
+ return rc;
+ BUG_ON(rc);
+ BUG_ON((int)nr_calls < 0);
+
+ for ( ; nr_calls > 0; --nr_calls, ++call_list)
+ if (unlikely(call_list->result != (rc_list ? *rc_list++ : 0)))
+ return nr_calls;
+
+ return 0;
+}
+
+/* A construct to ignore the return value of hypercall wrappers in a few
+ * exceptional cases (simply casting the function result to void doesn't
+ * avoid the compiler warning): */
+#define VOID(expr) ((void)((expr)?:0))
+
+#endif /* __XEN_HYPERCALL_H__ */
--- /dev/null
+/*
+ * copyright (c) 2006 IBM Corporation
+ * Authored by: Mike D. Day <ncmike@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _HYP_SYSFS_H_
+#define _HYP_SYSFS_H_
+
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+
+#define HYPERVISOR_ATTR_RO(_name) \
+static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name)
+
+#define HYPERVISOR_ATTR_RW(_name) \
+static struct hyp_sysfs_attr _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+struct hyp_sysfs_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct hyp_sysfs_attr *, char *);
+ ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
+ void *hyp_attr_data;
+};
+
+#endif /* _HYP_SYSFS_H_ */
--- /dev/null
+XEN NOTICE
+==========
+
+This copyright applies to all files within this subdirectory and its
+subdirectories:
+ include/public/*.h
+ include/public/hvm/*.h
+ include/public/io/*.h
+
+The intention is that these files can be freely copied into the source
+tree of an operating system when porting that OS to run on Xen. Doing
+so does *not* cause the OS to become subject to the terms of the GPL.
+
+All other files in the Xen source distribution are covered by version
+2 of the GNU General Public License except where explicitly stated
+otherwise within individual source files.
+
+ -- Keir Fraser (on behalf of the Xen team)
+
+=====================================================================
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to
+deal in the Software without restriction, including without limitation the
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
--- /dev/null
+/*
+ * acm.h: Xen access control module interface defintions
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Reiner Sailer <sailer@watson.ibm.com>
+ * Copyright (c) 2005, International Business Machines Corporation.
+ */
+
+#ifndef _XEN_PUBLIC_ACM_H
+#define _XEN_PUBLIC_ACM_H
+
+#include "xen.h"
+
+/* if ACM_DEBUG defined, all hooks should
+ * print a short trace message (comment it out
+ * when not in testing mode )
+ */
+/* #define ACM_DEBUG */
+
+#ifdef ACM_DEBUG
+# define printkd(fmt, args...) printk(fmt,## args)
+#else
+# define printkd(fmt, args...)
+#endif
+
+/* default ssid reference value if not supplied */
+#define ACM_DEFAULT_SSID 0x0
+#define ACM_DEFAULT_LOCAL_SSID 0x0
+
+/* Internal ACM ERROR types */
+#define ACM_OK 0
+#define ACM_UNDEF -1
+#define ACM_INIT_SSID_ERROR -2
+#define ACM_INIT_SOID_ERROR -3
+#define ACM_ERROR -4
+
+/* External ACCESS DECISIONS */
+#define ACM_ACCESS_PERMITTED 0
+#define ACM_ACCESS_DENIED -111
+#define ACM_NULL_POINTER_ERROR -200
+
+/*
+ Error codes reported in when trying to test for a new policy
+ These error codes are reported in an array of tuples where
+ each error code is followed by a parameter describing the error
+ more closely, such as a domain id.
+*/
+#define ACM_EVTCHN_SHARING_VIOLATION 0x100
+#define ACM_GNTTAB_SHARING_VIOLATION 0x101
+#define ACM_DOMAIN_LOOKUP 0x102
+#define ACM_CHWALL_CONFLICT 0x103
+#define ACM_SSIDREF_IN_USE 0x104
+
+
+/* primary policy in lower 4 bits */
+#define ACM_NULL_POLICY 0
+#define ACM_CHINESE_WALL_POLICY 1
+#define ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY 2
+#define ACM_POLICY_UNDEFINED 15
+
+/* combinations have secondary policy component in higher 4bit */
+#define ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY \
+ ((ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY << 4) | ACM_CHINESE_WALL_POLICY)
+
+/* policy: */
+#define ACM_POLICY_NAME(X) \
+ ((X) == (ACM_NULL_POLICY)) ? "NULL" : \
+ ((X) == (ACM_CHINESE_WALL_POLICY)) ? "CHINESE WALL" : \
+ ((X) == (ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "SIMPLE TYPE ENFORCEMENT" : \
+ ((X) == (ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "CHINESE WALL AND SIMPLE TYPE ENFORCEMENT" : \
+ "UNDEFINED"
+
+/* the following policy versions must be increased
+ * whenever the interpretation of the related
+ * policy's data structure changes
+ */
+#define ACM_POLICY_VERSION 3
+#define ACM_CHWALL_VERSION 1
+#define ACM_STE_VERSION 1
+
+/* defines a ssid reference used by xen */
+typedef uint32_t ssidref_t;
+
+/* hooks that are known to domains */
+#define ACMHOOK_none 0
+#define ACMHOOK_sharing 1
+
+/* -------security policy relevant type definitions-------- */
+
+/* type identifier; compares to "equal" or "not equal" */
+typedef uint16_t domaintype_t;
+
+/* CHINESE WALL POLICY DATA STRUCTURES
+ *
+ * current accumulated conflict type set:
+ * When a domain is started and has a type that is in
+ * a conflict set, the conflicting types are incremented in
+ * the aggregate set. When a domain is destroyed, the
+ * conflicting types to its type are decremented.
+ * If a domain has multiple types, this procedure works over
+ * all those types.
+ *
+ * conflict_aggregate_set[i] holds the number of
+ * running domains that have a conflict with type i.
+ *
+ * running_types[i] holds the number of running domains
+ * that include type i in their ssidref-referenced type set
+ *
+ * conflict_sets[i][j] is "0" if type j has no conflict
+ * with type i and is "1" otherwise.
+ */
+/* high-16 = version, low-16 = check magic */
+#define ACM_MAGIC 0x0001debc
+
+/* each offset in bytes from start of the struct they
+ * are part of */
+
+/* V3 of the policy buffer aded a version structure */
+struct acm_policy_version
+{
+ uint32_t major;
+ uint32_t minor;
+};
+
+
+/* each buffer consists of all policy information for
+ * the respective policy given in the policy code
+ *
+ * acm_policy_buffer, acm_chwall_policy_buffer,
+ * and acm_ste_policy_buffer need to stay 32-bit aligned
+ * because we create binary policies also with external
+ * tools that assume packed representations (e.g. the java tool)
+ */
+struct acm_policy_buffer {
+ uint32_t policy_version; /* ACM_POLICY_VERSION */
+ uint32_t magic;
+ uint32_t len;
+ uint32_t policy_reference_offset;
+ uint32_t primary_policy_code;
+ uint32_t primary_buffer_offset;
+ uint32_t secondary_policy_code;
+ uint32_t secondary_buffer_offset;
+ struct acm_policy_version xml_pol_version; /* add in V3 */
+};
+
+
+struct acm_policy_reference_buffer {
+ uint32_t len;
+};
+
+struct acm_chwall_policy_buffer {
+ uint32_t policy_version; /* ACM_CHWALL_VERSION */
+ uint32_t policy_code;
+ uint32_t chwall_max_types;
+ uint32_t chwall_max_ssidrefs;
+ uint32_t chwall_max_conflictsets;
+ uint32_t chwall_ssid_offset;
+ uint32_t chwall_conflict_sets_offset;
+ uint32_t chwall_running_types_offset;
+ uint32_t chwall_conflict_aggregate_offset;
+};
+
+struct acm_ste_policy_buffer {
+ uint32_t policy_version; /* ACM_STE_VERSION */
+ uint32_t policy_code;
+ uint32_t ste_max_types;
+ uint32_t ste_max_ssidrefs;
+ uint32_t ste_ssid_offset;
+};
+
+struct acm_stats_buffer {
+ uint32_t magic;
+ uint32_t len;
+ uint32_t primary_policy_code;
+ uint32_t primary_stats_offset;
+ uint32_t secondary_policy_code;
+ uint32_t secondary_stats_offset;
+};
+
+struct acm_ste_stats_buffer {
+ uint32_t ec_eval_count;
+ uint32_t gt_eval_count;
+ uint32_t ec_denied_count;
+ uint32_t gt_denied_count;
+ uint32_t ec_cachehit_count;
+ uint32_t gt_cachehit_count;
+};
+
+struct acm_ssid_buffer {
+ uint32_t len;
+ ssidref_t ssidref;
+ uint32_t policy_reference_offset;
+ uint32_t primary_policy_code;
+ uint32_t primary_max_types;
+ uint32_t primary_types_offset;
+ uint32_t secondary_policy_code;
+ uint32_t secondary_max_types;
+ uint32_t secondary_types_offset;
+};
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * acm_ops.h: Xen access control module hypervisor commands
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Reiner Sailer <sailer@watson.ibm.com>
+ * Copyright (c) 2005,2006 International Business Machines Corporation.
+ */
+
+#ifndef __XEN_PUBLIC_ACM_OPS_H__
+#define __XEN_PUBLIC_ACM_OPS_H__
+
+#include "xen.h"
+#include "acm.h"
+
+/*
+ * Make sure you increment the interface version whenever you modify this file!
+ * This makes sure that old versions of acm tools will stop working in a
+ * well-defined way (rather than crashing the machine, for instance).
+ */
+#define ACM_INTERFACE_VERSION 0xAAAA000A
+
+/************************************************************************/
+
+/*
+ * Prototype for this hypercall is:
+ * int acm_op(int cmd, void *args)
+ * @cmd == ACMOP_??? (access control module operation).
+ * @args == Operation-specific extra arguments (NULL if none).
+ */
+
+
+#define ACMOP_setpolicy 1
+struct acm_setpolicy {
+ /* IN */
+ XEN_GUEST_HANDLE_64(void) pushcache;
+ uint32_t pushcache_size;
+};
+
+
+#define ACMOP_getpolicy 2
+struct acm_getpolicy {
+ /* IN */
+ XEN_GUEST_HANDLE_64(void) pullcache;
+ uint32_t pullcache_size;
+};
+
+
+#define ACMOP_dumpstats 3
+struct acm_dumpstats {
+ /* IN */
+ XEN_GUEST_HANDLE_64(void) pullcache;
+ uint32_t pullcache_size;
+};
+
+
+#define ACMOP_getssid 4
+#define ACM_GETBY_ssidref 1
+#define ACM_GETBY_domainid 2
+struct acm_getssid {
+ /* IN */
+ uint32_t get_ssid_by; /* ACM_GETBY_* */
+ union {
+ domaintype_t domainid;
+ ssidref_t ssidref;
+ } id;
+ XEN_GUEST_HANDLE_64(void) ssidbuf;
+ uint32_t ssidbuf_size;
+};
+
+#define ACMOP_getdecision 5
+struct acm_getdecision {
+ /* IN */
+ uint32_t get_decision_by1; /* ACM_GETBY_* */
+ uint32_t get_decision_by2; /* ACM_GETBY_* */
+ union {
+ domaintype_t domainid;
+ ssidref_t ssidref;
+ } id1;
+ union {
+ domaintype_t domainid;
+ ssidref_t ssidref;
+ } id2;
+ uint32_t hook;
+ /* OUT */
+ uint32_t acm_decision;
+};
+
+
+#define ACMOP_chgpolicy 6
+struct acm_change_policy {
+ /* IN */
+ XEN_GUEST_HANDLE_64(void) policy_pushcache;
+ uint32_t policy_pushcache_size;
+ XEN_GUEST_HANDLE_64(void) del_array;
+ uint32_t delarray_size;
+ XEN_GUEST_HANDLE_64(void) chg_array;
+ uint32_t chgarray_size;
+ /* OUT */
+ /* array with error code */
+ XEN_GUEST_HANDLE_64(void) err_array;
+ uint32_t errarray_size;
+};
+
+#define ACMOP_relabeldoms 7
+struct acm_relabel_doms {
+ /* IN */
+ XEN_GUEST_HANDLE_64(void) relabel_map;
+ uint32_t relabel_map_size;
+ /* OUT */
+ XEN_GUEST_HANDLE_64(void) err_array;
+ uint32_t errarray_size;
+};
+
+/* future interface to Xen */
+struct xen_acmctl {
+ uint32_t cmd;
+ uint32_t interface_version;
+ union {
+ struct acm_setpolicy setpolicy;
+ struct acm_getpolicy getpolicy;
+ struct acm_dumpstats dumpstats;
+ struct acm_getssid getssid;
+ struct acm_getdecision getdecision;
+ struct acm_change_policy change_policy;
+ struct acm_relabel_doms relabel_doms;
+ } u;
+};
+
+typedef struct xen_acmctl xen_acmctl_t;
+DEFINE_XEN_GUEST_HANDLE(xen_acmctl_t);
+
+#endif /* __XEN_PUBLIC_ACM_OPS_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * arch-x86/cpuid.h
+ *
+ * CPUID interface to Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2007 Citrix Systems, Inc.
+ *
+ * Authors:
+ * Keir Fraser <keir.fraser@citrix.com>
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_CPUID_H__
+#define __XEN_PUBLIC_ARCH_X86_CPUID_H__
+
+/* Xen identification leaves start at 0x40000000. */
+#define XEN_CPUID_FIRST_LEAF 0x40000000
+#define XEN_CPUID_LEAF(i) (XEN_CPUID_FIRST_LEAF + (i))
+
+/*
+ * Leaf 1 (0x40000000)
+ * EAX: Largest Xen-information leaf. All leaves up to an including @EAX
+ * are supported by the Xen host.
+ * EBX-EDX: "XenVMMXenVMM" signature, allowing positive identification
+ * of a Xen host.
+ */
+#define XEN_CPUID_SIGNATURE_EBX 0x566e6558 /* "XenV" */
+#define XEN_CPUID_SIGNATURE_ECX 0x65584d4d /* "MMXe" */
+#define XEN_CPUID_SIGNATURE_EDX 0x4d4d566e /* "nVMM" */
+
+/*
+ * Leaf 2 (0x40000001)
+ * EAX[31:16]: Xen major version.
+ * EAX[15: 0]: Xen minor version.
+ * EBX-EDX: Reserved (currently all zeroes).
+ */
+
+/*
+ * Leaf 3 (0x40000002)
+ * EAX: Number of hypercall transfer pages. This register is always guaranteed
+ * to specify one hypercall page.
+ * EBX: Base address of Xen-specific MSRs.
+ * ECX: Features 1. Unused bits are set to zero.
+ * EDX: Features 2. Unused bits are set to zero.
+ */
+
+/* Does the host support MMU_PT_UPDATE_PRESERVE_AD for this guest? */
+#define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0
+#define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD (1u<<0)
+
+#endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
--- /dev/null
+/*
+ * Structure definitions for HVM state that is held by Xen and must
+ * be saved along with the domain's memory and device-model state.
+ *
+ * Copyright (c) 2007 XenSource Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_SAVE_X86_H__
+#define __XEN_PUBLIC_HVM_SAVE_X86_H__
+
+/*
+ * Save/restore header: general info about the save file.
+ */
+
+#define HVM_FILE_MAGIC 0x54381286
+#define HVM_FILE_VERSION 0x00000001
+
+struct hvm_save_header {
+ uint32_t magic; /* Must be HVM_FILE_MAGIC */
+ uint32_t version; /* File format version */
+ uint64_t changeset; /* Version of Xen that saved this file */
+ uint32_t cpuid; /* CPUID[0x01][%eax] on the saving machine */
+ uint32_t pad0;
+};
+
+DECLARE_HVM_SAVE_TYPE(HEADER, 1, struct hvm_save_header);
+
+
+/*
+ * Processor
+ */
+
+struct hvm_hw_cpu {
+ uint8_t fpu_regs[512];
+
+ uint64_t rax;
+ uint64_t rbx;
+ uint64_t rcx;
+ uint64_t rdx;
+ uint64_t rbp;
+ uint64_t rsi;
+ uint64_t rdi;
+ uint64_t rsp;
+ uint64_t r8;
+ uint64_t r9;
+ uint64_t r10;
+ uint64_t r11;
+ uint64_t r12;
+ uint64_t r13;
+ uint64_t r14;
+ uint64_t r15;
+
+ uint64_t rip;
+ uint64_t rflags;
+
+ uint64_t cr0;
+ uint64_t cr2;
+ uint64_t cr3;
+ uint64_t cr4;
+
+ uint64_t dr0;
+ uint64_t dr1;
+ uint64_t dr2;
+ uint64_t dr3;
+ uint64_t dr6;
+ uint64_t dr7;
+
+ uint32_t cs_sel;
+ uint32_t ds_sel;
+ uint32_t es_sel;
+ uint32_t fs_sel;
+ uint32_t gs_sel;
+ uint32_t ss_sel;
+ uint32_t tr_sel;
+ uint32_t ldtr_sel;
+
+ uint32_t cs_limit;
+ uint32_t ds_limit;
+ uint32_t es_limit;
+ uint32_t fs_limit;
+ uint32_t gs_limit;
+ uint32_t ss_limit;
+ uint32_t tr_limit;
+ uint32_t ldtr_limit;
+ uint32_t idtr_limit;
+ uint32_t gdtr_limit;
+
+ uint64_t cs_base;
+ uint64_t ds_base;
+ uint64_t es_base;
+ uint64_t fs_base;
+ uint64_t gs_base;
+ uint64_t ss_base;
+ uint64_t tr_base;
+ uint64_t ldtr_base;
+ uint64_t idtr_base;
+ uint64_t gdtr_base;
+
+ uint32_t cs_arbytes;
+ uint32_t ds_arbytes;
+ uint32_t es_arbytes;
+ uint32_t fs_arbytes;
+ uint32_t gs_arbytes;
+ uint32_t ss_arbytes;
+ uint32_t tr_arbytes;
+ uint32_t ldtr_arbytes;
+
+ uint32_t sysenter_cs;
+ uint32_t padding0;
+
+ uint64_t sysenter_esp;
+ uint64_t sysenter_eip;
+
+ /* msr for em64t */
+ uint64_t shadow_gs;
+
+ /* msr content saved/restored. */
+ uint64_t msr_flags;
+ uint64_t msr_lstar;
+ uint64_t msr_star;
+ uint64_t msr_cstar;
+ uint64_t msr_syscall_mask;
+ uint64_t msr_efer;
+
+ /* guest's idea of what rdtsc() would return */
+ uint64_t tsc;
+
+ /* pending event, if any */
+ union {
+ uint32_t pending_event;
+ struct {
+ uint8_t pending_vector:8;
+ uint8_t pending_type:3;
+ uint8_t pending_error_valid:1;
+ uint32_t pending_reserved:19;
+ uint8_t pending_valid:1;
+ };
+ };
+ /* error code for pending event */
+ uint32_t error_code;
+};
+
+DECLARE_HVM_SAVE_TYPE(CPU, 2, struct hvm_hw_cpu);
+
+
+/*
+ * PIC
+ */
+
+struct hvm_hw_vpic {
+ /* IR line bitmasks. */
+ uint8_t irr;
+ uint8_t imr;
+ uint8_t isr;
+
+ /* Line IRx maps to IRQ irq_base+x */
+ uint8_t irq_base;
+
+ /*
+ * Where are we in ICW2-4 initialisation (0 means no init in progress)?
+ * Bits 0-1 (=x): Next write at A=1 sets ICW(x+1).
+ * Bit 2: ICW1.IC4 (1 == ICW4 included in init sequence)
+ * Bit 3: ICW1.SNGL (0 == ICW3 included in init sequence)
+ */
+ uint8_t init_state:4;
+
+ /* IR line with highest priority. */
+ uint8_t priority_add:4;
+
+ /* Reads from A=0 obtain ISR or IRR? */
+ uint8_t readsel_isr:1;
+
+ /* Reads perform a polling read? */
+ uint8_t poll:1;
+
+ /* Automatically clear IRQs from the ISR during INTA? */
+ uint8_t auto_eoi:1;
+
+ /* Automatically rotate IRQ priorities during AEOI? */
+ uint8_t rotate_on_auto_eoi:1;
+
+ /* Exclude slave inputs when considering in-service IRQs? */
+ uint8_t special_fully_nested_mode:1;
+
+ /* Special mask mode excludes masked IRs from AEOI and priority checks. */
+ uint8_t special_mask_mode:1;
+
+ /* Is this a master PIC or slave PIC? (NB. This is not programmable.) */
+ uint8_t is_master:1;
+
+ /* Edge/trigger selection. */
+ uint8_t elcr;
+
+ /* Virtual INT output. */
+ uint8_t int_output;
+};
+
+DECLARE_HVM_SAVE_TYPE(PIC, 3, struct hvm_hw_vpic);
+
+
+/*
+ * IO-APIC
+ */
+
+#ifdef __ia64__
+#define VIOAPIC_IS_IOSAPIC 1
+#define VIOAPIC_NUM_PINS 24
+#else
+#define VIOAPIC_NUM_PINS 48 /* 16 ISA IRQs, 32 non-legacy PCI IRQS. */
+#endif
+
+struct hvm_hw_vioapic {
+ uint64_t base_address;
+ uint32_t ioregsel;
+ uint32_t id;
+ union vioapic_redir_entry
+ {
+ uint64_t bits;
+ struct {
+ uint8_t vector;
+ uint8_t delivery_mode:3;
+ uint8_t dest_mode:1;
+ uint8_t delivery_status:1;
+ uint8_t polarity:1;
+ uint8_t remote_irr:1;
+ uint8_t trig_mode:1;
+ uint8_t mask:1;
+ uint8_t reserve:7;
+#if !VIOAPIC_IS_IOSAPIC
+ uint8_t reserved[4];
+ uint8_t dest_id;
+#else
+ uint8_t reserved[3];
+ uint16_t dest_id;
+#endif
+ } fields;
+ } redirtbl[VIOAPIC_NUM_PINS];
+};
+
+DECLARE_HVM_SAVE_TYPE(IOAPIC, 4, struct hvm_hw_vioapic);
+
+
+/*
+ * LAPIC
+ */
+
+struct hvm_hw_lapic {
+ uint64_t apic_base_msr;
+ uint32_t disabled; /* VLAPIC_xx_DISABLED */
+ uint32_t timer_divisor;
+};
+
+DECLARE_HVM_SAVE_TYPE(LAPIC, 5, struct hvm_hw_lapic);
+
+struct hvm_hw_lapic_regs {
+ uint8_t data[1024];
+};
+
+DECLARE_HVM_SAVE_TYPE(LAPIC_REGS, 6, struct hvm_hw_lapic_regs);
+
+
+/*
+ * IRQs
+ */
+
+struct hvm_hw_pci_irqs {
+ /*
+ * Virtual interrupt wires for a single PCI bus.
+ * Indexed by: device*4 + INTx#.
+ */
+ union {
+ DECLARE_BITMAP(i, 32*4);
+ uint64_t pad[2];
+ };
+};
+
+DECLARE_HVM_SAVE_TYPE(PCI_IRQ, 7, struct hvm_hw_pci_irqs);
+
+struct hvm_hw_isa_irqs {
+ /*
+ * Virtual interrupt wires for ISA devices.
+ * Indexed by ISA IRQ (assumes no ISA-device IRQ sharing).
+ */
+ union {
+ DECLARE_BITMAP(i, 16);
+ uint64_t pad[1];
+ };
+};
+
+DECLARE_HVM_SAVE_TYPE(ISA_IRQ, 8, struct hvm_hw_isa_irqs);
+
+struct hvm_hw_pci_link {
+ /*
+ * PCI-ISA interrupt router.
+ * Each PCI <device:INTx#> is 'wire-ORed' into one of four links using
+ * the traditional 'barber's pole' mapping ((device + INTx#) & 3).
+ * The router provides a programmable mapping from each link to a GSI.
+ */
+ uint8_t route[4];
+ uint8_t pad0[4];
+};
+
+DECLARE_HVM_SAVE_TYPE(PCI_LINK, 9, struct hvm_hw_pci_link);
+
+/*
+ * PIT
+ */
+
+struct hvm_hw_pit {
+ struct hvm_hw_pit_channel {
+ uint32_t count; /* can be 65536 */
+ uint16_t latched_count;
+ uint8_t count_latched;
+ uint8_t status_latched;
+ uint8_t status;
+ uint8_t read_state;
+ uint8_t write_state;
+ uint8_t write_latch;
+ uint8_t rw_mode;
+ uint8_t mode;
+ uint8_t bcd; /* not supported */
+ uint8_t gate; /* timer start */
+ } channels[3]; /* 3 x 16 bytes */
+ uint32_t speaker_data_on;
+ uint32_t pad0;
+};
+
+DECLARE_HVM_SAVE_TYPE(PIT, 10, struct hvm_hw_pit);
+
+
+/*
+ * RTC
+ */
+
+#define RTC_CMOS_SIZE 14
+struct hvm_hw_rtc {
+ /* CMOS bytes */
+ uint8_t cmos_data[RTC_CMOS_SIZE];
+ /* Index register for 2-part operations */
+ uint8_t cmos_index;
+ uint8_t pad0;
+};
+
+DECLARE_HVM_SAVE_TYPE(RTC, 11, struct hvm_hw_rtc);
+
+
+/*
+ * HPET
+ */
+
+#define HPET_TIMER_NUM 3 /* 3 timers supported now */
+struct hvm_hw_hpet {
+ /* Memory-mapped, software visible registers */
+ uint64_t capability; /* capabilities */
+ uint64_t res0; /* reserved */
+ uint64_t config; /* configuration */
+ uint64_t res1; /* reserved */
+ uint64_t isr; /* interrupt status reg */
+ uint64_t res2[25]; /* reserved */
+ uint64_t mc64; /* main counter */
+ uint64_t res3; /* reserved */
+ struct { /* timers */
+ uint64_t config; /* configuration/cap */
+ uint64_t cmp; /* comparator */
+ uint64_t fsb; /* FSB route, not supported now */
+ uint64_t res4; /* reserved */
+ } timers[HPET_TIMER_NUM];
+ uint64_t res5[4*(24-HPET_TIMER_NUM)]; /* reserved, up to 0x3ff */
+
+ /* Hidden register state */
+ uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */
+};
+
+DECLARE_HVM_SAVE_TYPE(HPET, 12, struct hvm_hw_hpet);
+
+
+/*
+ * PM timer
+ */
+
+struct hvm_hw_pmtimer {
+ uint32_t tmr_val; /* PM_TMR_BLK.TMR_VAL: 32bit free-running counter */
+ uint16_t pm1a_sts; /* PM1a_EVT_BLK.PM1a_STS: status register */
+ uint16_t pm1a_en; /* PM1a_EVT_BLK.PM1a_EN: enable register */
+};
+
+DECLARE_HVM_SAVE_TYPE(PMTIMER, 13, struct hvm_hw_pmtimer);
+
+/*
+ * MTRR MSRs
+ */
+
+struct hvm_hw_mtrr {
+#define MTRR_VCNT 8
+#define NUM_FIXED_MSR 11
+ uint64_t msr_pat_cr;
+ /* mtrr physbase & physmask msr pair*/
+ uint64_t msr_mtrr_var[MTRR_VCNT*2];
+ uint64_t msr_mtrr_fixed[NUM_FIXED_MSR];
+ uint64_t msr_mtrr_cap;
+ uint64_t msr_mtrr_def_type;
+};
+
+DECLARE_HVM_SAVE_TYPE(MTRR, 14, struct hvm_hw_mtrr);
+
+/*
+ * Largest type-code in use
+ */
+#define HVM_SAVE_CODE_MAX 14
+
+#endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */
--- /dev/null
+/******************************************************************************
+ * xen-x86_32.h
+ *
+ * Guest OS interface to x86 32-bit Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2007, K A Fraser
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__
+#define __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__
+
+/*
+ * Hypercall interface:
+ * Input: %ebx, %ecx, %edx, %esi, %edi (arguments 1-5)
+ * Output: %eax
+ * Access is via hypercall page (set up by guest loader or via a Xen MSR):
+ * call hypercall_page + hypercall-number * 32
+ * Clobbered: Argument registers (e.g., 2-arg hypercall clobbers %ebx,%ecx)
+ */
+
+#if __XEN_INTERFACE_VERSION__ < 0x00030203
+/*
+ * Legacy hypercall interface:
+ * As above, except the entry sequence to the hypervisor is:
+ * mov $hypercall-number*32,%eax ; int $0x82
+ */
+#define TRAP_INSTR "int $0x82"
+#endif
+
+/*
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
+#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
+#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
+#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
+#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
+#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
+
+#define FLAT_KERNEL_CS FLAT_RING1_CS
+#define FLAT_KERNEL_DS FLAT_RING1_DS
+#define FLAT_KERNEL_SS FLAT_RING1_SS
+#define FLAT_USER_CS FLAT_RING3_CS
+#define FLAT_USER_DS FLAT_RING3_DS
+#define FLAT_USER_SS FLAT_RING3_SS
+
+#define __HYPERVISOR_VIRT_START_PAE 0xF5800000
+#define __MACH2PHYS_VIRT_START_PAE 0xF5800000
+#define __MACH2PHYS_VIRT_END_PAE 0xF6800000
+#define HYPERVISOR_VIRT_START_PAE \
+ mk_unsigned_long(__HYPERVISOR_VIRT_START_PAE)
+#define MACH2PHYS_VIRT_START_PAE \
+ mk_unsigned_long(__MACH2PHYS_VIRT_START_PAE)
+#define MACH2PHYS_VIRT_END_PAE \
+ mk_unsigned_long(__MACH2PHYS_VIRT_END_PAE)
+
+#define __HYPERVISOR_VIRT_START_NONPAE 0xFC000000
+#define __MACH2PHYS_VIRT_START_NONPAE 0xFC000000
+#define __MACH2PHYS_VIRT_END_NONPAE 0xFC400000
+#define HYPERVISOR_VIRT_START_NONPAE \
+ mk_unsigned_long(__HYPERVISOR_VIRT_START_NONPAE)
+#define MACH2PHYS_VIRT_START_NONPAE \
+ mk_unsigned_long(__MACH2PHYS_VIRT_START_NONPAE)
+#define MACH2PHYS_VIRT_END_NONPAE \
+ mk_unsigned_long(__MACH2PHYS_VIRT_END_NONPAE)
+
+#ifdef CONFIG_X86_PAE
+#define __HYPERVISOR_VIRT_START __HYPERVISOR_VIRT_START_PAE
+#define __MACH2PHYS_VIRT_START __MACH2PHYS_VIRT_START_PAE
+#define __MACH2PHYS_VIRT_END __MACH2PHYS_VIRT_END_PAE
+#else
+#define __HYPERVISOR_VIRT_START __HYPERVISOR_VIRT_START_NONPAE
+#define __MACH2PHYS_VIRT_START __MACH2PHYS_VIRT_START_NONPAE
+#define __MACH2PHYS_VIRT_END __MACH2PHYS_VIRT_END_NONPAE
+#endif
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#endif
+
+#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>2)
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)MACH2PHYS_VIRT_START)
+#endif
+
+/* 32-/64-bit invariability for control interfaces (domctl/sysctl). */
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+#undef ___DEFINE_XEN_GUEST_HANDLE
+#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
+ typedef struct { type *p; } \
+ __guest_handle_ ## name; \
+ typedef struct { union { type *p; uint64_aligned_t q; }; } \
+ __guest_handle_64_ ## name
+#undef set_xen_guest_handle
+#define set_xen_guest_handle(hnd, val) \
+ do { if ( sizeof(hnd) == 8 ) *(uint64_t *)&(hnd) = 0; \
+ (hnd).p = val; \
+ } while ( 0 )
+#define uint64_aligned_t uint64_t __attribute__((aligned(8)))
+#define __XEN_GUEST_HANDLE_64(name) __guest_handle_64_ ## name
+#define XEN_GUEST_HANDLE_64(name) __XEN_GUEST_HANDLE_64(name)
+#endif
+
+#ifndef __ASSEMBLY__
+
+struct cpu_user_regs {
+ uint32_t ebx;
+ uint32_t ecx;
+ uint32_t edx;
+ uint32_t esi;
+ uint32_t edi;
+ uint32_t ebp;
+ uint32_t eax;
+ uint16_t error_code; /* private */
+ uint16_t entry_vector; /* private */
+ uint32_t eip;
+ uint16_t cs;
+ uint8_t saved_upcall_mask;
+ uint8_t _pad0;
+ uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
+ uint32_t esp;
+ uint16_t ss, _pad1;
+ uint16_t es, _pad2;
+ uint16_t ds, _pad3;
+ uint16_t fs, _pad4;
+ uint16_t gs, _pad5;
+};
+typedef struct cpu_user_regs cpu_user_regs_t;
+DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+
+struct arch_vcpu_info {
+ unsigned long cr2;
+ unsigned long pad[5]; /* sizeof(vcpu_info_t) == 64 */
+};
+typedef struct arch_vcpu_info arch_vcpu_info_t;
+
+struct xen_callback {
+ unsigned long cs;
+ unsigned long eip;
+};
+typedef struct xen_callback xen_callback_t;
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * xen-x86_64.h
+ *
+ * Guest OS interface to x86 64-bit Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__
+#define __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__
+
+/*
+ * Hypercall interface:
+ * Input: %rdi, %rsi, %rdx, %r10, %r8 (arguments 1-5)
+ * Output: %rax
+ * Access is via hypercall page (set up by guest loader or via a Xen MSR):
+ * call hypercall_page + hypercall-number * 32
+ * Clobbered: argument registers (e.g., 2-arg hypercall clobbers %rdi,%rsi)
+ */
+
+#if __XEN_INTERFACE_VERSION__ < 0x00030203
+/*
+ * Legacy hypercall interface:
+ * As above, except the entry sequence to the hypervisor is:
+ * mov $hypercall-number*32,%eax ; syscall
+ * Clobbered: %rcx, %r11, argument registers (as above)
+ */
+#define TRAP_INSTR "syscall"
+#endif
+
+/*
+ * 64-bit segment selectors
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+
+#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */
+#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */
+#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */
+#define FLAT_RING3_DS64 0x0000 /* NULL selector */
+#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */
+#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */
+
+#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
+#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
+#define FLAT_KERNEL_DS FLAT_KERNEL_DS64
+#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
+#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
+#define FLAT_KERNEL_CS FLAT_KERNEL_CS64
+#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
+#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
+#define FLAT_KERNEL_SS FLAT_KERNEL_SS64
+
+#define FLAT_USER_DS64 FLAT_RING3_DS64
+#define FLAT_USER_DS32 FLAT_RING3_DS32
+#define FLAT_USER_DS FLAT_USER_DS64
+#define FLAT_USER_CS64 FLAT_RING3_CS64
+#define FLAT_USER_CS32 FLAT_RING3_CS32
+#define FLAT_USER_CS FLAT_USER_CS64
+#define FLAT_USER_SS64 FLAT_RING3_SS64
+#define FLAT_USER_SS32 FLAT_RING3_SS32
+#define FLAT_USER_SS FLAT_USER_SS64
+
+#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
+#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
+#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
+#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
+#endif
+
+#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/*
+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
+ * @which == SEGBASE_* ; @base == 64-bit base address
+ * Returns 0 on success.
+ */
+#define SEGBASE_FS 0
+#define SEGBASE_GS_USER 1
+#define SEGBASE_GS_KERNEL 2
+#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
+
+/*
+ * int HYPERVISOR_iret(void)
+ * All arguments are on the kernel stack, in the following format.
+ * Never returns if successful. Current kernel context is lost.
+ * The saved CS is mapped as follows:
+ * RING0 -> RING3 kernel mode.
+ * RING1 -> RING3 kernel mode.
+ * RING2 -> RING3 kernel mode.
+ * RING3 -> RING3 user mode.
+ * However RING0 indicates that the guest kernel should return to iteself
+ * directly with
+ * orb $3,1*8(%rsp)
+ * iretq
+ * If flags contains VGCF_in_syscall:
+ * Restore RAX, RIP, RFLAGS, RSP.
+ * Discard R11, RCX, CS, SS.
+ * Otherwise:
+ * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
+ * All other registers are saved on hypercall entry and restored to user.
+ */
+/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
+#define _VGCF_in_syscall 8
+#define VGCF_in_syscall (1<<_VGCF_in_syscall)
+#define VGCF_IN_SYSCALL VGCF_in_syscall
+
+#ifndef __ASSEMBLY__
+
+struct iret_context {
+ /* Top of stack (%rsp at point of hypercall). */
+ uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+ /* Bottom of iret stack frame. */
+};
+
+#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
+#define __DECL_REG(name) union { \
+ uint64_t r ## name, e ## name; \
+ uint32_t _e ## name; \
+}
+#else
+/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
+#define __DECL_REG(name) uint64_t r ## name
+#endif
+
+struct cpu_user_regs {
+ uint64_t r15;
+ uint64_t r14;
+ uint64_t r13;
+ uint64_t r12;
+ __DECL_REG(bp);
+ __DECL_REG(bx);
+ uint64_t r11;
+ uint64_t r10;
+ uint64_t r9;
+ uint64_t r8;
+ __DECL_REG(ax);
+ __DECL_REG(cx);
+ __DECL_REG(dx);
+ __DECL_REG(si);
+ __DECL_REG(di);
+ uint32_t error_code; /* private */
+ uint32_t entry_vector; /* private */
+ __DECL_REG(ip);
+ uint16_t cs, _pad0[1];
+ uint8_t saved_upcall_mask;
+ uint8_t _pad1[3];
+ __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */
+ __DECL_REG(sp);
+ uint16_t ss, _pad2[3];
+ uint16_t es, _pad3[3];
+ uint16_t ds, _pad4[3];
+ uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
+ uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
+};
+typedef struct cpu_user_regs cpu_user_regs_t;
+DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
+
+#undef __DECL_REG
+
+#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
+#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
+
+struct arch_vcpu_info {
+ unsigned long cr2;
+ unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
+};
+typedef struct arch_vcpu_info arch_vcpu_info_t;
+
+typedef unsigned long xen_callback_t;
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * arch-x86/xen.h
+ *
+ * Guest OS interface to x86 Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
+ */
+
+#include "../xen.h"
+
+#ifndef __XEN_PUBLIC_ARCH_X86_XEN_H__
+#define __XEN_PUBLIC_ARCH_X86_XEN_H__
+
+/* Structural guest handles introduced in 0x00030201. */
+#if __XEN_INTERFACE_VERSION__ >= 0x00030201
+#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
+ typedef struct { type *p; } __guest_handle_ ## name
+#else
+#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
+ typedef type * __guest_handle_ ## name
+#endif
+
+#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
+ ___DEFINE_XEN_GUEST_HANDLE(name, type); \
+ ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type)
+#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
+#define __XEN_GUEST_HANDLE(name) __guest_handle_ ## name
+#define XEN_GUEST_HANDLE(name) __XEN_GUEST_HANDLE(name)
+#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
+#ifdef __XEN_TOOLS__
+#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0)
+#endif
+
+/* Allow co-existing Linux 2.6.23+ Xen interface definitions. */
+#define DEFINE_XEN_GUEST_HANDLE_STRUCT(name) struct name
+
+#if defined(__i386__)
+#include "xen-x86_32.h"
+#elif defined(__x86_64__)
+#include "xen-x86_64.h"
+#endif
+
+#ifndef __ASSEMBLY__
+typedef unsigned long xen_pfn_t;
+#define PRI_xen_pfn "lx"
+#endif
+
+/*
+ * SEGMENT DESCRIPTOR TABLES
+ */
+/*
+ * A number of GDT entries are reserved by Xen. These are not situated at the
+ * start of the GDT because some stupid OSes export hard-coded selector values
+ * in their ABI. These hard-coded values are always near the start of the GDT,
+ * so Xen places itself out of the way, at the far end of the GDT.
+ */
+#define FIRST_RESERVED_GDT_PAGE 14
+#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
+#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
+
+/* Maximum number of virtual CPUs in multi-processor guests. */
+#define MAX_VIRT_CPUS 32
+
+#ifndef __ASSEMBLY__
+
+typedef unsigned long xen_ulong_t;
+
+/*
+ * Send an array of these to HYPERVISOR_set_trap_table().
+ * The privilege level specifies which modes may enter a trap via a software
+ * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
+ * privilege levels as follows:
+ * Level == 0: Noone may enter
+ * Level == 1: Kernel may enter
+ * Level == 2: Kernel may enter
+ * Level == 3: Everyone may enter
+ */
+#define TI_GET_DPL(_ti) ((_ti)->flags & 3)
+#define TI_GET_IF(_ti) ((_ti)->flags & 4)
+#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
+#define TI_SET_IF(_ti,_if) ((_ti)->flags |= ((!!(_if))<<2))
+struct trap_info {
+ uint8_t vector; /* exception vector */
+ uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
+ uint16_t cs; /* code selector */
+ unsigned long address; /* code offset */
+};
+typedef struct trap_info trap_info_t;
+DEFINE_XEN_GUEST_HANDLE(trap_info_t);
+
+typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+/*
+ * The following is all CPU context. Note that the fpu_ctxt block is filled
+ * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ */
+struct vcpu_guest_context {
+ /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
+ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
+#define VGCF_I387_VALID (1<<0)
+#define VGCF_IN_KERNEL (1<<2)
+#define _VGCF_i387_valid 0
+#define VGCF_i387_valid (1<<_VGCF_i387_valid)
+#define _VGCF_in_kernel 2
+#define VGCF_in_kernel (1<<_VGCF_in_kernel)
+#define _VGCF_failsafe_disables_events 3
+#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
+#define _VGCF_syscall_disables_events 4
+#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
+#define _VGCF_online 5
+#define VGCF_online (1<<_VGCF_online)
+ unsigned long flags; /* VGCF_* flags */
+ struct cpu_user_regs user_regs; /* User-level CPU registers */
+ struct trap_info trap_ctxt[256]; /* Virtual IDT */
+ unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
+ unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
+ unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
+ /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
+ unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
+ unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
+#ifdef __i386__
+ unsigned long event_callback_cs; /* CS:EIP of event callback */
+ unsigned long event_callback_eip;
+ unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
+ unsigned long failsafe_callback_eip;
+#else
+ unsigned long event_callback_eip;
+ unsigned long failsafe_callback_eip;
+#ifdef __XEN__
+ union {
+ unsigned long syscall_callback_eip;
+ struct {
+ unsigned int event_callback_cs; /* compat CS of event cb */
+ unsigned int failsafe_callback_cs; /* compat CS of failsafe cb */
+ };
+ };
+#else
+ unsigned long syscall_callback_eip;
+#endif
+#endif
+ unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
+#ifdef __x86_64__
+ /* Segment base addresses. */
+ uint64_t fs_base;
+ uint64_t gs_base_kernel;
+ uint64_t gs_base_user;
+#endif
+};
+typedef struct vcpu_guest_context vcpu_guest_context_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
+
+struct arch_shared_info {
+ unsigned long max_pfn; /* max pfn that appears in table */
+ /* Frame containing list of mfns containing list of mfns containing p2m. */
+ xen_pfn_t pfn_to_mfn_frame_list_list;
+ unsigned long nmi_reason;
+ uint64_t pad[32];
+};
+typedef struct arch_shared_info arch_shared_info_t;
+
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * Prefix forces emulation of some non-trapping instructions.
+ * Currently only CPUID.
+ */
+#ifdef __ASSEMBLY__
+#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
+#define XEN_CPUID XEN_EMULATE_PREFIX cpuid
+#else
+#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
+#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
+#endif
+
+#endif /* __XEN_PUBLIC_ARCH_X86_XEN_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * arch-x86_32.h
+ *
+ * Guest OS interface to x86 32-bit Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
+ */
+
+#include "arch-x86/xen.h"
--- /dev/null
+/******************************************************************************
+ * arch-x86_64.h
+ *
+ * Guest OS interface to x86 64-bit Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
+ */
+
+#include "arch-x86/xen.h"
struct callback_register {
uint16_t type;
uint16_t flags;
- struct xen_callback address;
+ xen_callback_t address;
};
+typedef struct callback_register callback_register_t;
+DEFINE_XEN_GUEST_HANDLE(callback_register_t);
/*
* Unregister a callback.
uint16_t type;
uint16_t _unused;
};
+typedef struct callback_unregister callback_unregister_t;
+DEFINE_XEN_GUEST_HANDLE(callback_unregister_t);
+
+#if __XEN_INTERFACE_VERSION__ < 0x00030207
+#undef CALLBACKTYPE_sysenter
+#define CALLBACKTYPE_sysenter CALLBACKTYPE_sysenter_deprecated
+#endif
#endif /* __XEN_PUBLIC_CALLBACK_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * dom0_ops.h
+ *
+ * Process command requests from domain-0 guest OS.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2002-2003, B Dragovic
+ * Copyright (c) 2002-2006, K Fraser
+ */
+
+#ifndef __XEN_PUBLIC_DOM0_OPS_H__
+#define __XEN_PUBLIC_DOM0_OPS_H__
+
+#include "xen.h"
+#include "platform.h"
+
+#if __XEN_INTERFACE_VERSION__ >= 0x00030204
+#error "dom0_ops.h is a compatibility interface only"
+#endif
+
+#define DOM0_INTERFACE_VERSION XENPF_INTERFACE_VERSION
+
+#define DOM0_SETTIME XENPF_settime
+#define dom0_settime xenpf_settime
+#define dom0_settime_t xenpf_settime_t
+
+#define DOM0_ADD_MEMTYPE XENPF_add_memtype
+#define dom0_add_memtype xenpf_add_memtype
+#define dom0_add_memtype_t xenpf_add_memtype_t
+
+#define DOM0_DEL_MEMTYPE XENPF_del_memtype
+#define dom0_del_memtype xenpf_del_memtype
+#define dom0_del_memtype_t xenpf_del_memtype_t
+
+#define DOM0_READ_MEMTYPE XENPF_read_memtype
+#define dom0_read_memtype xenpf_read_memtype
+#define dom0_read_memtype_t xenpf_read_memtype_t
+
+#define DOM0_MICROCODE XENPF_microcode_update
+#define dom0_microcode xenpf_microcode_update
+#define dom0_microcode_t xenpf_microcode_update_t
+
+#define DOM0_PLATFORM_QUIRK XENPF_platform_quirk
+#define dom0_platform_quirk xenpf_platform_quirk
+#define dom0_platform_quirk_t xenpf_platform_quirk_t
+
+typedef uint64_t cpumap_t;
+
+/* Unsupported legacy operation -- defined for API compatibility. */
+#define DOM0_MSR 15
+struct dom0_msr {
+ /* IN variables. */
+ uint32_t write;
+ cpumap_t cpu_mask;
+ uint32_t msr;
+ uint32_t in1;
+ uint32_t in2;
+ /* OUT variables. */
+ uint32_t out1;
+ uint32_t out2;
+};
+typedef struct dom0_msr dom0_msr_t;
+DEFINE_XEN_GUEST_HANDLE(dom0_msr_t);
+
+/* Unsupported legacy operation -- defined for API compatibility. */
+#define DOM0_PHYSICAL_MEMORY_MAP 40
+struct dom0_memory_map_entry {
+ uint64_t start, end;
+ uint32_t flags; /* reserved */
+ uint8_t is_ram;
+};
+typedef struct dom0_memory_map_entry dom0_memory_map_entry_t;
+DEFINE_XEN_GUEST_HANDLE(dom0_memory_map_entry_t);
+
+struct dom0_op {
+ uint32_t cmd;
+ uint32_t interface_version; /* DOM0_INTERFACE_VERSION */
+ union {
+ struct dom0_msr msr;
+ struct dom0_settime settime;
+ struct dom0_add_memtype add_memtype;
+ struct dom0_del_memtype del_memtype;
+ struct dom0_read_memtype read_memtype;
+ struct dom0_microcode microcode;
+ struct dom0_platform_quirk platform_quirk;
+ struct dom0_memory_map_entry physical_memory_map;
+ uint8_t pad[128];
+ } u;
+};
+typedef struct dom0_op dom0_op_t;
+DEFINE_XEN_GUEST_HANDLE(dom0_op_t);
+
+#endif /* __XEN_PUBLIC_DOM0_OPS_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * domctl.h
+ *
+ * Domain management operations. For use by node control stack.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2002-2003, B Dragovic
+ * Copyright (c) 2002-2006, K Fraser
+ */
+
+#ifndef __XEN_PUBLIC_DOMCTL_H__
+#define __XEN_PUBLIC_DOMCTL_H__
+
+#if !defined(__XEN__) && !defined(__XEN_TOOLS__)
+#error "domctl operations are intended for use by node control tools only"
+#endif
+
+#include "xen.h"
+
+#define XEN_DOMCTL_INTERFACE_VERSION 0x00000005
+
+struct xenctl_cpumap {
+ XEN_GUEST_HANDLE_64(uint8) bitmap;
+ uint32_t nr_cpus;
+};
+
+/*
+ * NB. xen_domctl.domain is an IN/OUT parameter for this operation.
+ * If it is specified as zero, an id is auto-allocated and returned.
+ */
+#define XEN_DOMCTL_createdomain 1
+struct xen_domctl_createdomain {
+ /* IN parameters */
+ uint32_t ssidref;
+ xen_domain_handle_t handle;
+ /* Is this an HVM guest (as opposed to a PV guest)? */
+#define _XEN_DOMCTL_CDF_hvm_guest 0
+#define XEN_DOMCTL_CDF_hvm_guest (1U<<_XEN_DOMCTL_CDF_hvm_guest)
+ /* Use hardware-assisted paging if available? */
+#define _XEN_DOMCTL_CDF_hap 1
+#define XEN_DOMCTL_CDF_hap (1U<<_XEN_DOMCTL_CDF_hap)
+ uint32_t flags;
+};
+typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t);
+
+#define XEN_DOMCTL_destroydomain 2
+#define XEN_DOMCTL_pausedomain 3
+#define XEN_DOMCTL_unpausedomain 4
+#define XEN_DOMCTL_resumedomain 27
+
+#define XEN_DOMCTL_getdomaininfo 5
+struct xen_domctl_getdomaininfo {
+ /* OUT variables. */
+ domid_t domain; /* Also echoed in domctl.domain */
+ /* Domain is scheduled to die. */
+#define _XEN_DOMINF_dying 0
+#define XEN_DOMINF_dying (1U<<_XEN_DOMINF_dying)
+ /* Domain is an HVM guest (as opposed to a PV guest). */
+#define _XEN_DOMINF_hvm_guest 1
+#define XEN_DOMINF_hvm_guest (1U<<_XEN_DOMINF_hvm_guest)
+ /* The guest OS has shut down. */
+#define _XEN_DOMINF_shutdown 2
+#define XEN_DOMINF_shutdown (1U<<_XEN_DOMINF_shutdown)
+ /* Currently paused by control software. */
+#define _XEN_DOMINF_paused 3
+#define XEN_DOMINF_paused (1U<<_XEN_DOMINF_paused)
+ /* Currently blocked pending an event. */
+#define _XEN_DOMINF_blocked 4
+#define XEN_DOMINF_blocked (1U<<_XEN_DOMINF_blocked)
+ /* Domain is currently running. */
+#define _XEN_DOMINF_running 5
+#define XEN_DOMINF_running (1U<<_XEN_DOMINF_running)
+ /* Being debugged. */
+#define _XEN_DOMINF_debugged 6
+#define XEN_DOMINF_debugged (1U<<_XEN_DOMINF_debugged)
+ /* CPU to which this domain is bound. */
+#define XEN_DOMINF_cpumask 255
+#define XEN_DOMINF_cpushift 8
+ /* XEN_DOMINF_shutdown guest-supplied code. */
+#define XEN_DOMINF_shutdownmask 255
+#define XEN_DOMINF_shutdownshift 16
+ uint32_t flags; /* XEN_DOMINF_* */
+ uint64_aligned_t tot_pages;
+ uint64_aligned_t max_pages;
+ uint64_aligned_t shared_info_frame; /* GMFN of shared_info struct */
+ uint64_aligned_t cpu_time;
+ uint32_t nr_online_vcpus; /* Number of VCPUs currently online. */
+ uint32_t max_vcpu_id; /* Maximum VCPUID in use by this domain. */
+ uint32_t ssidref;
+ xen_domain_handle_t handle;
+};
+typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t);
+
+
+#define XEN_DOMCTL_getmemlist 6
+struct xen_domctl_getmemlist {
+ /* IN variables. */
+ /* Max entries to write to output buffer. */
+ uint64_aligned_t max_pfns;
+ /* Start index in guest's page list. */
+ uint64_aligned_t start_pfn;
+ XEN_GUEST_HANDLE_64(uint64) buffer;
+ /* OUT variables. */
+ uint64_aligned_t num_pfns;
+};
+typedef struct xen_domctl_getmemlist xen_domctl_getmemlist_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getmemlist_t);
+
+
+#define XEN_DOMCTL_getpageframeinfo 7
+
+#define XEN_DOMCTL_PFINFO_LTAB_SHIFT 28
+#define XEN_DOMCTL_PFINFO_NOTAB (0x0U<<28)
+#define XEN_DOMCTL_PFINFO_L1TAB (0x1U<<28)
+#define XEN_DOMCTL_PFINFO_L2TAB (0x2U<<28)
+#define XEN_DOMCTL_PFINFO_L3TAB (0x3U<<28)
+#define XEN_DOMCTL_PFINFO_L4TAB (0x4U<<28)
+#define XEN_DOMCTL_PFINFO_LTABTYPE_MASK (0x7U<<28)
+#define XEN_DOMCTL_PFINFO_LPINTAB (0x1U<<31)
+#define XEN_DOMCTL_PFINFO_XTAB (0xfU<<28) /* invalid page */
+#define XEN_DOMCTL_PFINFO_LTAB_MASK (0xfU<<28)
+
+struct xen_domctl_getpageframeinfo {
+ /* IN variables. */
+ uint64_aligned_t gmfn; /* GMFN to query */
+ /* OUT variables. */
+ /* Is the page PINNED to a type? */
+ uint32_t type; /* see above type defs */
+};
+typedef struct xen_domctl_getpageframeinfo xen_domctl_getpageframeinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo_t);
+
+
+#define XEN_DOMCTL_getpageframeinfo2 8
+struct xen_domctl_getpageframeinfo2 {
+ /* IN variables. */
+ uint64_aligned_t num;
+ /* IN/OUT variables. */
+ XEN_GUEST_HANDLE_64(uint32) array;
+};
+typedef struct xen_domctl_getpageframeinfo2 xen_domctl_getpageframeinfo2_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo2_t);
+
+
+/*
+ * Control shadow pagetables operation
+ */
+#define XEN_DOMCTL_shadow_op 10
+
+/* Disable shadow mode. */
+#define XEN_DOMCTL_SHADOW_OP_OFF 0
+
+/* Enable shadow mode (mode contains ORed XEN_DOMCTL_SHADOW_ENABLE_* flags). */
+#define XEN_DOMCTL_SHADOW_OP_ENABLE 32
+
+/* Log-dirty bitmap operations. */
+ /* Return the bitmap and clean internal copy for next round. */
+#define XEN_DOMCTL_SHADOW_OP_CLEAN 11
+ /* Return the bitmap but do not modify internal copy. */
+#define XEN_DOMCTL_SHADOW_OP_PEEK 12
+
+/* Memory allocation accessors. */
+#define XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION 30
+#define XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION 31
+
+/* Legacy enable operations. */
+ /* Equiv. to ENABLE with no mode flags. */
+#define XEN_DOMCTL_SHADOW_OP_ENABLE_TEST 1
+ /* Equiv. to ENABLE with mode flag ENABLE_LOG_DIRTY. */
+#define XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY 2
+ /* Equiv. to ENABLE with mode flags ENABLE_REFCOUNT and ENABLE_TRANSLATE. */
+#define XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE 3
+
+/* Mode flags for XEN_DOMCTL_SHADOW_OP_ENABLE. */
+ /*
+ * Shadow pagetables are refcounted: guest does not use explicit mmu
+ * operations nor write-protect its pagetables.
+ */
+#define XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT (1 << 1)
+ /*
+ * Log pages in a bitmap as they are dirtied.
+ * Used for live relocation to determine which pages must be re-sent.
+ */
+#define XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY (1 << 2)
+ /*
+ * Automatically translate GPFNs into MFNs.
+ */
+#define XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE (1 << 3)
+ /*
+ * Xen does not steal virtual address space from the guest.
+ * Requires HVM support.
+ */
+#define XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL (1 << 4)
+
+struct xen_domctl_shadow_op_stats {
+ uint32_t fault_count;
+ uint32_t dirty_count;
+};
+typedef struct xen_domctl_shadow_op_stats xen_domctl_shadow_op_stats_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_stats_t);
+
+struct xen_domctl_shadow_op {
+ /* IN variables. */
+ uint32_t op; /* XEN_DOMCTL_SHADOW_OP_* */
+
+ /* OP_ENABLE */
+ uint32_t mode; /* XEN_DOMCTL_SHADOW_ENABLE_* */
+
+ /* OP_GET_ALLOCATION / OP_SET_ALLOCATION */
+ uint32_t mb; /* Shadow memory allocation in MB */
+
+ /* OP_PEEK / OP_CLEAN */
+ XEN_GUEST_HANDLE_64(uint8) dirty_bitmap;
+ uint64_aligned_t pages; /* Size of buffer. Updated with actual size. */
+ struct xen_domctl_shadow_op_stats stats;
+};
+typedef struct xen_domctl_shadow_op xen_domctl_shadow_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_t);
+
+
+#define XEN_DOMCTL_max_mem 11
+struct xen_domctl_max_mem {
+ /* IN variables. */
+ uint64_aligned_t max_memkb;
+};
+typedef struct xen_domctl_max_mem xen_domctl_max_mem_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_mem_t);
+
+
+#define XEN_DOMCTL_setvcpucontext 12
+#define XEN_DOMCTL_getvcpucontext 13
+struct xen_domctl_vcpucontext {
+ uint32_t vcpu; /* IN */
+ XEN_GUEST_HANDLE_64(vcpu_guest_context_t) ctxt; /* IN/OUT */
+};
+typedef struct xen_domctl_vcpucontext xen_domctl_vcpucontext_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpucontext_t);
+
+
+#define XEN_DOMCTL_getvcpuinfo 14
+struct xen_domctl_getvcpuinfo {
+ /* IN variables. */
+ uint32_t vcpu;
+ /* OUT variables. */
+ uint8_t online; /* currently online (not hotplugged)? */
+ uint8_t blocked; /* blocked waiting for an event? */
+ uint8_t running; /* currently scheduled on its CPU? */
+ uint64_aligned_t cpu_time; /* total cpu time consumed (ns) */
+ uint32_t cpu; /* current mapping */
+};
+typedef struct xen_domctl_getvcpuinfo xen_domctl_getvcpuinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t);
+
+
+/* Get/set which physical cpus a vcpu can execute on. */
+#define XEN_DOMCTL_setvcpuaffinity 9
+#define XEN_DOMCTL_getvcpuaffinity 25
+struct xen_domctl_vcpuaffinity {
+ uint32_t vcpu; /* IN */
+ struct xenctl_cpumap cpumap; /* IN/OUT */
+};
+typedef struct xen_domctl_vcpuaffinity xen_domctl_vcpuaffinity_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuaffinity_t);
+
+
+#define XEN_DOMCTL_max_vcpus 15
+struct xen_domctl_max_vcpus {
+ uint32_t max; /* maximum number of vcpus */
+};
+typedef struct xen_domctl_max_vcpus xen_domctl_max_vcpus_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
+
+
+#define XEN_DOMCTL_scheduler_op 16
+/* Scheduler types. */
+#define XEN_SCHEDULER_SEDF 4
+#define XEN_SCHEDULER_CREDIT 5
+/* Set or get info? */
+#define XEN_DOMCTL_SCHEDOP_putinfo 0
+#define XEN_DOMCTL_SCHEDOP_getinfo 1
+struct xen_domctl_scheduler_op {
+ uint32_t sched_id; /* XEN_SCHEDULER_* */
+ uint32_t cmd; /* XEN_DOMCTL_SCHEDOP_* */
+ union {
+ struct xen_domctl_sched_sedf {
+ uint64_aligned_t period;
+ uint64_aligned_t slice;
+ uint64_aligned_t latency;
+ uint32_t extratime;
+ uint32_t weight;
+ } sedf;
+ struct xen_domctl_sched_credit {
+ uint16_t weight;
+ uint16_t cap;
+ } credit;
+ } u;
+};
+typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_scheduler_op_t);
+
+
+#define XEN_DOMCTL_setdomainhandle 17
+struct xen_domctl_setdomainhandle {
+ xen_domain_handle_t handle;
+};
+typedef struct xen_domctl_setdomainhandle xen_domctl_setdomainhandle_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdomainhandle_t);
+
+
+#define XEN_DOMCTL_setdebugging 18
+struct xen_domctl_setdebugging {
+ uint8_t enable;
+};
+typedef struct xen_domctl_setdebugging xen_domctl_setdebugging_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdebugging_t);
+
+
+#define XEN_DOMCTL_irq_permission 19
+struct xen_domctl_irq_permission {
+ uint8_t pirq;
+ uint8_t allow_access; /* flag to specify enable/disable of IRQ access */
+};
+typedef struct xen_domctl_irq_permission xen_domctl_irq_permission_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_irq_permission_t);
+
+
+#define XEN_DOMCTL_iomem_permission 20
+struct xen_domctl_iomem_permission {
+ uint64_aligned_t first_mfn;/* first page (physical page number) in range */
+ uint64_aligned_t nr_mfns; /* number of pages in range (>0) */
+ uint8_t allow_access; /* allow (!0) or deny (0) access to range? */
+};
+typedef struct xen_domctl_iomem_permission xen_domctl_iomem_permission_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_iomem_permission_t);
+
+
+#define XEN_DOMCTL_ioport_permission 21
+struct xen_domctl_ioport_permission {
+ uint32_t first_port; /* first port int range */
+ uint32_t nr_ports; /* size of port range */
+ uint8_t allow_access; /* allow or deny access to range? */
+};
+typedef struct xen_domctl_ioport_permission xen_domctl_ioport_permission_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_permission_t);
+
+
+#define XEN_DOMCTL_hypercall_init 22
+struct xen_domctl_hypercall_init {
+ uint64_aligned_t gmfn; /* GMFN to be initialised */
+};
+typedef struct xen_domctl_hypercall_init xen_domctl_hypercall_init_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_hypercall_init_t);
+
+
+#define XEN_DOMCTL_arch_setup 23
+#define _XEN_DOMAINSETUP_hvm_guest 0
+#define XEN_DOMAINSETUP_hvm_guest (1UL<<_XEN_DOMAINSETUP_hvm_guest)
+#define _XEN_DOMAINSETUP_query 1 /* Get parameters (for save) */
+#define XEN_DOMAINSETUP_query (1UL<<_XEN_DOMAINSETUP_query)
+#define _XEN_DOMAINSETUP_sioemu_guest 2
+#define XEN_DOMAINSETUP_sioemu_guest (1UL<<_XEN_DOMAINSETUP_sioemu_guest)
+typedef struct xen_domctl_arch_setup {
+ uint64_aligned_t flags; /* XEN_DOMAINSETUP_* */
+#ifdef __ia64__
+ uint64_aligned_t bp; /* mpaddr of boot param area */
+ uint64_aligned_t maxmem; /* Highest memory address for MDT. */
+ uint64_aligned_t xsi_va; /* Xen shared_info area virtual address. */
+ uint32_t hypercall_imm; /* Break imm for Xen hypercalls. */
+ int8_t vhpt_size_log2; /* Log2 of VHPT size. */
+#endif
+} xen_domctl_arch_setup_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_arch_setup_t);
+
+
+#define XEN_DOMCTL_settimeoffset 24
+struct xen_domctl_settimeoffset {
+ int32_t time_offset_seconds; /* applied to domain wallclock time */
+};
+typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
+
+
+#define XEN_DOMCTL_gethvmcontext 33
+#define XEN_DOMCTL_sethvmcontext 34
+typedef struct xen_domctl_hvmcontext {
+ uint32_t size; /* IN/OUT: size of buffer / bytes filled */
+ XEN_GUEST_HANDLE_64(uint8) buffer; /* IN/OUT: data, or call
+ * gethvmcontext with NULL
+ * buffer to get size req'd */
+} xen_domctl_hvmcontext_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_t);
+
+
+#define XEN_DOMCTL_set_address_size 35
+#define XEN_DOMCTL_get_address_size 36
+typedef struct xen_domctl_address_size {
+ uint32_t size;
+} xen_domctl_address_size_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_address_size_t);
+
+
+#define XEN_DOMCTL_real_mode_area 26
+struct xen_domctl_real_mode_area {
+ uint32_t log; /* log2 of Real Mode Area size */
+};
+typedef struct xen_domctl_real_mode_area xen_domctl_real_mode_area_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_real_mode_area_t);
+
+
+#define XEN_DOMCTL_sendtrigger 28
+#define XEN_DOMCTL_SENDTRIGGER_NMI 0
+#define XEN_DOMCTL_SENDTRIGGER_RESET 1
+#define XEN_DOMCTL_SENDTRIGGER_INIT 2
+struct xen_domctl_sendtrigger {
+ uint32_t trigger; /* IN */
+ uint32_t vcpu; /* IN */
+};
+typedef struct xen_domctl_sendtrigger xen_domctl_sendtrigger_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_sendtrigger_t);
+
+
+/* Assign PCI device to HVM guest. Sets up IOMMU structures. */
+#define XEN_DOMCTL_assign_device 37
+#define XEN_DOMCTL_test_assign_device 45
+#define XEN_DOMCTL_deassign_device 47
+struct xen_domctl_assign_device {
+ uint32_t machine_bdf; /* machine PCI ID of assigned device */
+};
+typedef struct xen_domctl_assign_device xen_domctl_assign_device_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_assign_device_t);
+
+
+/* Pass-through interrupts: bind real irq -> hvm devfn. */
+#define XEN_DOMCTL_bind_pt_irq 38
+#define XEN_DOMCTL_unbind_pt_irq 48
+typedef enum pt_irq_type_e {
+ PT_IRQ_TYPE_PCI,
+ PT_IRQ_TYPE_ISA
+} pt_irq_type_t;
+struct xen_domctl_bind_pt_irq {
+ uint32_t machine_irq;
+ pt_irq_type_t irq_type;
+ uint32_t hvm_domid;
+
+ union {
+ struct {
+ uint8_t isa_irq;
+ } isa;
+ struct {
+ uint8_t bus;
+ uint8_t device;
+ uint8_t intx;
+ } pci;
+ } u;
+};
+typedef struct xen_domctl_bind_pt_irq xen_domctl_bind_pt_irq_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_pt_irq_t);
+
+
+/* Bind machine I/O address range -> HVM address range. */
+#define XEN_DOMCTL_memory_mapping 39
+#define DPCI_ADD_MAPPING 1
+#define DPCI_REMOVE_MAPPING 0
+struct xen_domctl_memory_mapping {
+ uint64_aligned_t first_gfn; /* first page (hvm guest phys page) in range */
+ uint64_aligned_t first_mfn; /* first page (machine page) in range */
+ uint64_aligned_t nr_mfns; /* number of pages in range (>0) */
+ uint32_t add_mapping; /* add or remove mapping */
+ uint32_t padding; /* padding for 64-bit aligned structure */
+};
+typedef struct xen_domctl_memory_mapping xen_domctl_memory_mapping_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_memory_mapping_t);
+
+
+/* Bind machine I/O port range -> HVM I/O port range. */
+#define XEN_DOMCTL_ioport_mapping 40
+struct xen_domctl_ioport_mapping {
+ uint32_t first_gport; /* first guest IO port*/
+ uint32_t first_mport; /* first machine IO port */
+ uint32_t nr_ports; /* size of port range */
+ uint32_t add_mapping; /* add or remove mapping */
+};
+typedef struct xen_domctl_ioport_mapping xen_domctl_ioport_mapping_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_mapping_t);
+
+
+/*
+ * Pin caching type of RAM space for x86 HVM domU.
+ */
+#define XEN_DOMCTL_pin_mem_cacheattr 41
+/* Caching types: these happen to be the same as x86 MTRR/PAT type codes. */
+#define XEN_DOMCTL_MEM_CACHEATTR_UC 0
+#define XEN_DOMCTL_MEM_CACHEATTR_WC 1
+#define XEN_DOMCTL_MEM_CACHEATTR_WT 4
+#define XEN_DOMCTL_MEM_CACHEATTR_WP 5
+#define XEN_DOMCTL_MEM_CACHEATTR_WB 6
+#define XEN_DOMCTL_MEM_CACHEATTR_UCM 7
+struct xen_domctl_pin_mem_cacheattr {
+ uint64_aligned_t start, end;
+ unsigned int type; /* XEN_DOMCTL_MEM_CACHEATTR_* */
+};
+typedef struct xen_domctl_pin_mem_cacheattr xen_domctl_pin_mem_cacheattr_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_pin_mem_cacheattr_t);
+
+
+#define XEN_DOMCTL_set_ext_vcpucontext 42
+#define XEN_DOMCTL_get_ext_vcpucontext 43
+struct xen_domctl_ext_vcpucontext {
+ /* IN: VCPU that this call applies to. */
+ uint32_t vcpu;
+ /*
+ * SET: Size of struct (IN)
+ * GET: Size of struct (OUT)
+ */
+ uint32_t size;
+#if defined(__i386__) || defined(__x86_64__)
+ /* SYSCALL from 32-bit mode and SYSENTER callback information. */
+ /* NB. SYSCALL from 64-bit mode is contained in vcpu_guest_context_t */
+ uint64_aligned_t syscall32_callback_eip;
+ uint64_aligned_t sysenter_callback_eip;
+ uint16_t syscall32_callback_cs;
+ uint16_t sysenter_callback_cs;
+ uint8_t syscall32_disables_events;
+ uint8_t sysenter_disables_events;
+#endif
+};
+typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_ext_vcpucontext_t);
+
+/*
+ * Set optimizaton features for a domain
+ */
+#define XEN_DOMCTL_set_opt_feature 44
+struct xen_domctl_set_opt_feature {
+#if defined(__ia64__)
+ struct xen_ia64_opt_feature optf;
+#else
+ /* Make struct non-empty: do not depend on this field name! */
+ uint64_t dummy;
+#endif
+};
+typedef struct xen_domctl_set_opt_feature xen_domctl_set_opt_feature_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_opt_feature_t);
+
+/*
+ * Set the target domain for a domain
+ */
+#define XEN_DOMCTL_set_target 46
+struct xen_domctl_set_target {
+ domid_t target;
+};
+typedef struct xen_domctl_set_target xen_domctl_set_target_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_target_t);
+
+
+struct xen_domctl {
+ uint32_t cmd;
+ uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
+ domid_t domain;
+ union {
+ struct xen_domctl_createdomain createdomain;
+ struct xen_domctl_getdomaininfo getdomaininfo;
+ struct xen_domctl_getmemlist getmemlist;
+ struct xen_domctl_getpageframeinfo getpageframeinfo;
+ struct xen_domctl_getpageframeinfo2 getpageframeinfo2;
+ struct xen_domctl_vcpuaffinity vcpuaffinity;
+ struct xen_domctl_shadow_op shadow_op;
+ struct xen_domctl_max_mem max_mem;
+ struct xen_domctl_vcpucontext vcpucontext;
+ struct xen_domctl_getvcpuinfo getvcpuinfo;
+ struct xen_domctl_max_vcpus max_vcpus;
+ struct xen_domctl_scheduler_op scheduler_op;
+ struct xen_domctl_setdomainhandle setdomainhandle;
+ struct xen_domctl_setdebugging setdebugging;
+ struct xen_domctl_irq_permission irq_permission;
+ struct xen_domctl_iomem_permission iomem_permission;
+ struct xen_domctl_ioport_permission ioport_permission;
+ struct xen_domctl_hypercall_init hypercall_init;
+ struct xen_domctl_arch_setup arch_setup;
+ struct xen_domctl_settimeoffset settimeoffset;
+ struct xen_domctl_real_mode_area real_mode_area;
+ struct xen_domctl_hvmcontext hvmcontext;
+ struct xen_domctl_address_size address_size;
+ struct xen_domctl_sendtrigger sendtrigger;
+ struct xen_domctl_assign_device assign_device;
+ struct xen_domctl_bind_pt_irq bind_pt_irq;
+ struct xen_domctl_memory_mapping memory_mapping;
+ struct xen_domctl_ioport_mapping ioport_mapping;
+ struct xen_domctl_pin_mem_cacheattr pin_mem_cacheattr;
+ struct xen_domctl_ext_vcpucontext ext_vcpucontext;
+ struct xen_domctl_set_opt_feature set_opt_feature;
+ struct xen_domctl_set_target set_target;
+ uint8_t pad[128];
+ } u;
+};
+typedef struct xen_domctl xen_domctl_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_t);
+
+#endif /* __XEN_PUBLIC_DOMCTL_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
*
* Definitions used for the Xen ELF notes.
*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
* Copyright (c) 2006, Ian Campbell, XenSource Ltd.
*/
#define __XEN_PUBLIC_ELFNOTE_H__
/*
- * The notes should live in a SHT_NOTE segment and have "Xen" in the
+ * The notes should live in a PT_NOTE segment and have "Xen" in the
* name field.
*
* Numeric types are either 4 or 8 bytes depending on the content of
/*
* NAME=VALUE pair (string).
- *
- * LEGACY: FEATURES and PAE
*/
#define XEN_ELFNOTE_INFO 0
#define XEN_ELFNOTE_LOADER 8
/*
- * The kernel supports PAE (x86/32 only, string = "yes" or "no").
+ * The kernel supports PAE (x86/32 only, string = "yes", "no" or
+ * "bimodal").
+ *
+ * For compatibility with Xen 3.0.3 and earlier the "bimodal" setting
+ * may be given as "yes,bimodal" which will cause older Xen to treat
+ * this kernel as PAE.
*
* LEGACY: PAE (n.b. The legacy interface included a provision to
* indicate 'extended-cr3' support allowing L3 page tables to be
*/
#define XEN_ELFNOTE_BSD_SYMTAB 11
+/*
+ * The lowest address the hypervisor hole can begin at (numeric).
+ *
+ * This must not be set higher than HYPERVISOR_VIRT_START. Its presence
+ * also indicates to the hypervisor that the kernel can deal with the
+ * hole starting at a higher address.
+ */
+#define XEN_ELFNOTE_HV_START_LOW 12
+
+/*
+ * List of maddr_t-sized mask/value pairs describing how to recognize
+ * (non-present) L1 page table entries carrying valid MFNs (numeric).
+ */
+#define XEN_ELFNOTE_L1_MFN_VALID 13
+
+/*
+ * Whether or not the guest supports cooperative suspend cancellation.
+ */
+#define XEN_ELFNOTE_SUSPEND_CANCEL 14
+
+/*
+ * The number of the highest elfnote defined.
+ */
+#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUSPEND_CANCEL
+
+/*
+ * System information exported through crash notes.
+ *
+ * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_INFO
+ * note in case of a system crash. This note will contain various
+ * information about the system, see xen/include/xen/elfcore.h.
+ */
+#define XEN_ELFNOTE_CRASH_INFO 0x1000001
+
+/*
+ * System registers exported through crash notes.
+ *
+ * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_REGS
+ * note per cpu in case of a system crash. This note is architecture
+ * specific and will contain registers not saved in the "CORE" note.
+ * See xen/include/xen/elfcore.h for more information.
+ */
+#define XEN_ELFNOTE_CRASH_REGS 0x1000002
+
+
+/*
+ * xen dump-core none note.
+ * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_NONE
+ * in its dump file to indicate that the file is xen dump-core
+ * file. This note doesn't have any other information.
+ * See tools/libxc/xc_core.h for more information.
+ */
+#define XEN_ELFNOTE_DUMPCORE_NONE 0x2000000
+
+/*
+ * xen dump-core header note.
+ * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_HEADER
+ * in its dump file.
+ * See tools/libxc/xc_core.h for more information.
+ */
+#define XEN_ELFNOTE_DUMPCORE_HEADER 0x2000001
+
+/*
+ * xen dump-core xen version note.
+ * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_XEN_VERSION
+ * in its dump file. It contains the xen version obtained via the
+ * XENVER hypercall.
+ * See tools/libxc/xc_core.h for more information.
+ */
+#define XEN_ELFNOTE_DUMPCORE_XEN_VERSION 0x2000002
+
+/*
+ * xen dump-core format version note.
+ * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_FORMAT_VERSION
+ * in its dump file. It contains a format version identifier.
+ * See tools/libxc/xc_core.h for more information.
+ */
+#define XEN_ELFNOTE_DUMPCORE_FORMAT_VERSION 0x2000003
+
#endif /* __XEN_PUBLIC_ELFNOTE_H__ */
/*
--- /dev/null
+#ifndef __XEN_PUBLIC_ELFSTRUCTS_H__
+#define __XEN_PUBLIC_ELFSTRUCTS_H__ 1
+/*
+ * Copyright (c) 1995, 1996 Erik Theisen. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+typedef uint8_t Elf_Byte;
+
+typedef uint32_t Elf32_Addr; /* Unsigned program address */
+typedef uint32_t Elf32_Off; /* Unsigned file offset */
+typedef int32_t Elf32_Sword; /* Signed large integer */
+typedef uint32_t Elf32_Word; /* Unsigned large integer */
+typedef uint16_t Elf32_Half; /* Unsigned medium integer */
+
+typedef uint64_t Elf64_Addr;
+typedef uint64_t Elf64_Off;
+typedef int32_t Elf64_Shalf;
+
+typedef int32_t Elf64_Sword;
+typedef uint32_t Elf64_Word;
+
+typedef int64_t Elf64_Sxword;
+typedef uint64_t Elf64_Xword;
+
+typedef uint32_t Elf64_Half;
+typedef uint16_t Elf64_Quarter;
+
+/*
+ * e_ident[] identification indexes
+ * See http://www.caldera.com/developers/gabi/2000-07-17/ch4.eheader.html
+ */
+#define EI_MAG0 0 /* file ID */
+#define EI_MAG1 1 /* file ID */
+#define EI_MAG2 2 /* file ID */
+#define EI_MAG3 3 /* file ID */
+#define EI_CLASS 4 /* file class */
+#define EI_DATA 5 /* data encoding */
+#define EI_VERSION 6 /* ELF header version */
+#define EI_OSABI 7 /* OS/ABI ID */
+#define EI_ABIVERSION 8 /* ABI version */
+#define EI_PAD 9 /* start of pad bytes */
+#define EI_NIDENT 16 /* Size of e_ident[] */
+
+/* e_ident[] magic number */
+#define ELFMAG0 0x7f /* e_ident[EI_MAG0] */
+#define ELFMAG1 'E' /* e_ident[EI_MAG1] */
+#define ELFMAG2 'L' /* e_ident[EI_MAG2] */
+#define ELFMAG3 'F' /* e_ident[EI_MAG3] */
+#define ELFMAG "\177ELF" /* magic */
+#define SELFMAG 4 /* size of magic */
+
+/* e_ident[] file class */
+#define ELFCLASSNONE 0 /* invalid */
+#define ELFCLASS32 1 /* 32-bit objs */
+#define ELFCLASS64 2 /* 64-bit objs */
+#define ELFCLASSNUM 3 /* number of classes */
+
+/* e_ident[] data encoding */
+#define ELFDATANONE 0 /* invalid */
+#define ELFDATA2LSB 1 /* Little-Endian */
+#define ELFDATA2MSB 2 /* Big-Endian */
+#define ELFDATANUM 3 /* number of data encode defines */
+
+/* e_ident[] Operating System/ABI */
+#define ELFOSABI_SYSV 0 /* UNIX System V ABI */
+#define ELFOSABI_HPUX 1 /* HP-UX operating system */
+#define ELFOSABI_NETBSD 2 /* NetBSD */
+#define ELFOSABI_LINUX 3 /* GNU/Linux */
+#define ELFOSABI_HURD 4 /* GNU/Hurd */
+#define ELFOSABI_86OPEN 5 /* 86Open common IA32 ABI */
+#define ELFOSABI_SOLARIS 6 /* Solaris */
+#define ELFOSABI_MONTEREY 7 /* Monterey */
+#define ELFOSABI_IRIX 8 /* IRIX */
+#define ELFOSABI_FREEBSD 9 /* FreeBSD */
+#define ELFOSABI_TRU64 10 /* TRU64 UNIX */
+#define ELFOSABI_MODESTO 11 /* Novell Modesto */
+#define ELFOSABI_OPENBSD 12 /* OpenBSD */
+#define ELFOSABI_ARM 97 /* ARM */
+#define ELFOSABI_STANDALONE 255 /* Standalone (embedded) application */
+
+/* e_ident */
+#define IS_ELF(ehdr) ((ehdr).e_ident[EI_MAG0] == ELFMAG0 && \
+ (ehdr).e_ident[EI_MAG1] == ELFMAG1 && \
+ (ehdr).e_ident[EI_MAG2] == ELFMAG2 && \
+ (ehdr).e_ident[EI_MAG3] == ELFMAG3)
+
+/* ELF Header */
+typedef struct elfhdr {
+ unsigned char e_ident[EI_NIDENT]; /* ELF Identification */
+ Elf32_Half e_type; /* object file type */
+ Elf32_Half e_machine; /* machine */
+ Elf32_Word e_version; /* object file version */
+ Elf32_Addr e_entry; /* virtual entry point */
+ Elf32_Off e_phoff; /* program header table offset */
+ Elf32_Off e_shoff; /* section header table offset */
+ Elf32_Word e_flags; /* processor-specific flags */
+ Elf32_Half e_ehsize; /* ELF header size */
+ Elf32_Half e_phentsize; /* program header entry size */
+ Elf32_Half e_phnum; /* number of program header entries */
+ Elf32_Half e_shentsize; /* section header entry size */
+ Elf32_Half e_shnum; /* number of section header entries */
+ Elf32_Half e_shstrndx; /* section header table's "section
+ header string table" entry offset */
+} Elf32_Ehdr;
+
+typedef struct {
+ unsigned char e_ident[EI_NIDENT]; /* Id bytes */
+ Elf64_Quarter e_type; /* file type */
+ Elf64_Quarter e_machine; /* machine type */
+ Elf64_Half e_version; /* version number */
+ Elf64_Addr e_entry; /* entry point */
+ Elf64_Off e_phoff; /* Program hdr offset */
+ Elf64_Off e_shoff; /* Section hdr offset */
+ Elf64_Half e_flags; /* Processor flags */
+ Elf64_Quarter e_ehsize; /* sizeof ehdr */
+ Elf64_Quarter e_phentsize; /* Program header entry size */
+ Elf64_Quarter e_phnum; /* Number of program headers */
+ Elf64_Quarter e_shentsize; /* Section header entry size */
+ Elf64_Quarter e_shnum; /* Number of section headers */
+ Elf64_Quarter e_shstrndx; /* String table index */
+} Elf64_Ehdr;
+
+/* e_type */
+#define ET_NONE 0 /* No file type */
+#define ET_REL 1 /* relocatable file */
+#define ET_EXEC 2 /* executable file */
+#define ET_DYN 3 /* shared object file */
+#define ET_CORE 4 /* core file */
+#define ET_NUM 5 /* number of types */
+#define ET_LOPROC 0xff00 /* reserved range for processor */
+#define ET_HIPROC 0xffff /* specific e_type */
+
+/* e_machine */
+#define EM_NONE 0 /* No Machine */
+#define EM_M32 1 /* AT&T WE 32100 */
+#define EM_SPARC 2 /* SPARC */
+#define EM_386 3 /* Intel 80386 */
+#define EM_68K 4 /* Motorola 68000 */
+#define EM_88K 5 /* Motorola 88000 */
+#define EM_486 6 /* Intel 80486 - unused? */
+#define EM_860 7 /* Intel 80860 */
+#define EM_MIPS 8 /* MIPS R3000 Big-Endian only */
+/*
+ * Don't know if EM_MIPS_RS4_BE,
+ * EM_SPARC64, EM_PARISC,
+ * or EM_PPC are ABI compliant
+ */
+#define EM_MIPS_RS4_BE 10 /* MIPS R4000 Big-Endian */
+#define EM_SPARC64 11 /* SPARC v9 64-bit unoffical */
+#define EM_PARISC 15 /* HPPA */
+#define EM_SPARC32PLUS 18 /* Enhanced instruction set SPARC */
+#define EM_PPC 20 /* PowerPC */
+#define EM_PPC64 21 /* PowerPC 64-bit */
+#define EM_ARM 40 /* Advanced RISC Machines ARM */
+#define EM_ALPHA 41 /* DEC ALPHA */
+#define EM_SPARCV9 43 /* SPARC version 9 */
+#define EM_ALPHA_EXP 0x9026 /* DEC ALPHA */
+#define EM_IA_64 50 /* Intel Merced */
+#define EM_X86_64 62 /* AMD x86-64 architecture */
+#define EM_VAX 75 /* DEC VAX */
+
+/* Version */
+#define EV_NONE 0 /* Invalid */
+#define EV_CURRENT 1 /* Current */
+#define EV_NUM 2 /* number of versions */
+
+/* Section Header */
+typedef struct {
+ Elf32_Word sh_name; /* name - index into section header
+ string table section */
+ Elf32_Word sh_type; /* type */
+ Elf32_Word sh_flags; /* flags */
+ Elf32_Addr sh_addr; /* address */
+ Elf32_Off sh_offset; /* file offset */
+ Elf32_Word sh_size; /* section size */
+ Elf32_Word sh_link; /* section header table index link */
+ Elf32_Word sh_info; /* extra information */
+ Elf32_Word sh_addralign; /* address alignment */
+ Elf32_Word sh_entsize; /* section entry size */
+} Elf32_Shdr;
+
+typedef struct {
+ Elf64_Half sh_name; /* section name */
+ Elf64_Half sh_type; /* section type */
+ Elf64_Xword sh_flags; /* section flags */
+ Elf64_Addr sh_addr; /* virtual address */
+ Elf64_Off sh_offset; /* file offset */
+ Elf64_Xword sh_size; /* section size */
+ Elf64_Half sh_link; /* link to another */
+ Elf64_Half sh_info; /* misc info */
+ Elf64_Xword sh_addralign; /* memory alignment */
+ Elf64_Xword sh_entsize; /* table entry size */
+} Elf64_Shdr;
+
+/* Special Section Indexes */
+#define SHN_UNDEF 0 /* undefined */
+#define SHN_LORESERVE 0xff00 /* lower bounds of reserved indexes */
+#define SHN_LOPROC 0xff00 /* reserved range for processor */
+#define SHN_HIPROC 0xff1f /* specific section indexes */
+#define SHN_ABS 0xfff1 /* absolute value */
+#define SHN_COMMON 0xfff2 /* common symbol */
+#define SHN_HIRESERVE 0xffff /* upper bounds of reserved indexes */
+
+/* sh_type */
+#define SHT_NULL 0 /* inactive */
+#define SHT_PROGBITS 1 /* program defined information */
+#define SHT_SYMTAB 2 /* symbol table section */
+#define SHT_STRTAB 3 /* string table section */
+#define SHT_RELA 4 /* relocation section with addends*/
+#define SHT_HASH 5 /* symbol hash table section */
+#define SHT_DYNAMIC 6 /* dynamic section */
+#define SHT_NOTE 7 /* note section */
+#define SHT_NOBITS 8 /* no space section */
+#define SHT_REL 9 /* relation section without addends */
+#define SHT_SHLIB 10 /* reserved - purpose unknown */
+#define SHT_DYNSYM 11 /* dynamic symbol table section */
+#define SHT_NUM 12 /* number of section types */
+#define SHT_LOPROC 0x70000000 /* reserved range for processor */
+#define SHT_HIPROC 0x7fffffff /* specific section header types */
+#define SHT_LOUSER 0x80000000 /* reserved range for application */
+#define SHT_HIUSER 0xffffffff /* specific indexes */
+
+/* Section names */
+#define ELF_BSS ".bss" /* uninitialized data */
+#define ELF_DATA ".data" /* initialized data */
+#define ELF_DEBUG ".debug" /* debug */
+#define ELF_DYNAMIC ".dynamic" /* dynamic linking information */
+#define ELF_DYNSTR ".dynstr" /* dynamic string table */
+#define ELF_DYNSYM ".dynsym" /* dynamic symbol table */
+#define ELF_FINI ".fini" /* termination code */
+#define ELF_GOT ".got" /* global offset table */
+#define ELF_HASH ".hash" /* symbol hash table */
+#define ELF_INIT ".init" /* initialization code */
+#define ELF_REL_DATA ".rel.data" /* relocation data */
+#define ELF_REL_FINI ".rel.fini" /* relocation termination code */
+#define ELF_REL_INIT ".rel.init" /* relocation initialization code */
+#define ELF_REL_DYN ".rel.dyn" /* relocaltion dynamic link info */
+#define ELF_REL_RODATA ".rel.rodata" /* relocation read-only data */
+#define ELF_REL_TEXT ".rel.text" /* relocation code */
+#define ELF_RODATA ".rodata" /* read-only data */
+#define ELF_SHSTRTAB ".shstrtab" /* section header string table */
+#define ELF_STRTAB ".strtab" /* string table */
+#define ELF_SYMTAB ".symtab" /* symbol table */
+#define ELF_TEXT ".text" /* code */
+
+
+/* Section Attribute Flags - sh_flags */
+#define SHF_WRITE 0x1 /* Writable */
+#define SHF_ALLOC 0x2 /* occupies memory */
+#define SHF_EXECINSTR 0x4 /* executable */
+#define SHF_MASKPROC 0xf0000000 /* reserved bits for processor */
+ /* specific section attributes */
+
+/* Symbol Table Entry */
+typedef struct elf32_sym {
+ Elf32_Word st_name; /* name - index into string table */
+ Elf32_Addr st_value; /* symbol value */
+ Elf32_Word st_size; /* symbol size */
+ unsigned char st_info; /* type and binding */
+ unsigned char st_other; /* 0 - no defined meaning */
+ Elf32_Half st_shndx; /* section header index */
+} Elf32_Sym;
+
+typedef struct {
+ Elf64_Half st_name; /* Symbol name index in str table */
+ Elf_Byte st_info; /* type / binding attrs */
+ Elf_Byte st_other; /* unused */
+ Elf64_Quarter st_shndx; /* section index of symbol */
+ Elf64_Xword st_value; /* value of symbol */
+ Elf64_Xword st_size; /* size of symbol */
+} Elf64_Sym;
+
+/* Symbol table index */
+#define STN_UNDEF 0 /* undefined */
+
+/* Extract symbol info - st_info */
+#define ELF32_ST_BIND(x) ((x) >> 4)
+#define ELF32_ST_TYPE(x) (((unsigned int) x) & 0xf)
+#define ELF32_ST_INFO(b,t) (((b) << 4) + ((t) & 0xf))
+
+#define ELF64_ST_BIND(x) ((x) >> 4)
+#define ELF64_ST_TYPE(x) (((unsigned int) x) & 0xf)
+#define ELF64_ST_INFO(b,t) (((b) << 4) + ((t) & 0xf))
+
+/* Symbol Binding - ELF32_ST_BIND - st_info */
+#define STB_LOCAL 0 /* Local symbol */
+#define STB_GLOBAL 1 /* Global symbol */
+#define STB_WEAK 2 /* like global - lower precedence */
+#define STB_NUM 3 /* number of symbol bindings */
+#define STB_LOPROC 13 /* reserved range for processor */
+#define STB_HIPROC 15 /* specific symbol bindings */
+
+/* Symbol type - ELF32_ST_TYPE - st_info */
+#define STT_NOTYPE 0 /* not specified */
+#define STT_OBJECT 1 /* data object */
+#define STT_FUNC 2 /* function */
+#define STT_SECTION 3 /* section */
+#define STT_FILE 4 /* file */
+#define STT_NUM 5 /* number of symbol types */
+#define STT_LOPROC 13 /* reserved range for processor */
+#define STT_HIPROC 15 /* specific symbol types */
+
+/* Relocation entry with implicit addend */
+typedef struct {
+ Elf32_Addr r_offset; /* offset of relocation */
+ Elf32_Word r_info; /* symbol table index and type */
+} Elf32_Rel;
+
+/* Relocation entry with explicit addend */
+typedef struct {
+ Elf32_Addr r_offset; /* offset of relocation */
+ Elf32_Word r_info; /* symbol table index and type */
+ Elf32_Sword r_addend;
+} Elf32_Rela;
+
+/* Extract relocation info - r_info */
+#define ELF32_R_SYM(i) ((i) >> 8)
+#define ELF32_R_TYPE(i) ((unsigned char) (i))
+#define ELF32_R_INFO(s,t) (((s) << 8) + (unsigned char)(t))
+
+typedef struct {
+ Elf64_Xword r_offset; /* where to do it */
+ Elf64_Xword r_info; /* index & type of relocation */
+} Elf64_Rel;
+
+typedef struct {
+ Elf64_Xword r_offset; /* where to do it */
+ Elf64_Xword r_info; /* index & type of relocation */
+ Elf64_Sxword r_addend; /* adjustment value */
+} Elf64_Rela;
+
+#define ELF64_R_SYM(info) ((info) >> 32)
+#define ELF64_R_TYPE(info) ((info) & 0xFFFFFFFF)
+#define ELF64_R_INFO(s,t) (((s) << 32) + (u_int32_t)(t))
+
+/* Program Header */
+typedef struct {
+ Elf32_Word p_type; /* segment type */
+ Elf32_Off p_offset; /* segment offset */
+ Elf32_Addr p_vaddr; /* virtual address of segment */
+ Elf32_Addr p_paddr; /* physical address - ignored? */
+ Elf32_Word p_filesz; /* number of bytes in file for seg. */
+ Elf32_Word p_memsz; /* number of bytes in mem. for seg. */
+ Elf32_Word p_flags; /* flags */
+ Elf32_Word p_align; /* memory alignment */
+} Elf32_Phdr;
+
+typedef struct {
+ Elf64_Half p_type; /* entry type */
+ Elf64_Half p_flags; /* flags */
+ Elf64_Off p_offset; /* offset */
+ Elf64_Addr p_vaddr; /* virtual address */
+ Elf64_Addr p_paddr; /* physical address */
+ Elf64_Xword p_filesz; /* file size */
+ Elf64_Xword p_memsz; /* memory size */
+ Elf64_Xword p_align; /* memory & file alignment */
+} Elf64_Phdr;
+
+/* Segment types - p_type */
+#define PT_NULL 0 /* unused */
+#define PT_LOAD 1 /* loadable segment */
+#define PT_DYNAMIC 2 /* dynamic linking section */
+#define PT_INTERP 3 /* the RTLD */
+#define PT_NOTE 4 /* auxiliary information */
+#define PT_SHLIB 5 /* reserved - purpose undefined */
+#define PT_PHDR 6 /* program header */
+#define PT_NUM 7 /* Number of segment types */
+#define PT_LOPROC 0x70000000 /* reserved range for processor */
+#define PT_HIPROC 0x7fffffff /* specific segment types */
+
+/* Segment flags - p_flags */
+#define PF_X 0x1 /* Executable */
+#define PF_W 0x2 /* Writable */
+#define PF_R 0x4 /* Readable */
+#define PF_MASKPROC 0xf0000000 /* reserved bits for processor */
+ /* specific segment flags */
+
+/* Dynamic structure */
+typedef struct {
+ Elf32_Sword d_tag; /* controls meaning of d_val */
+ union {
+ Elf32_Word d_val; /* Multiple meanings - see d_tag */
+ Elf32_Addr d_ptr; /* program virtual address */
+ } d_un;
+} Elf32_Dyn;
+
+typedef struct {
+ Elf64_Xword d_tag; /* controls meaning of d_val */
+ union {
+ Elf64_Addr d_ptr;
+ Elf64_Xword d_val;
+ } d_un;
+} Elf64_Dyn;
+
+/* Dynamic Array Tags - d_tag */
+#define DT_NULL 0 /* marks end of _DYNAMIC array */
+#define DT_NEEDED 1 /* string table offset of needed lib */
+#define DT_PLTRELSZ 2 /* size of relocation entries in PLT */
+#define DT_PLTGOT 3 /* address PLT/GOT */
+#define DT_HASH 4 /* address of symbol hash table */
+#define DT_STRTAB 5 /* address of string table */
+#define DT_SYMTAB 6 /* address of symbol table */
+#define DT_RELA 7 /* address of relocation table */
+#define DT_RELASZ 8 /* size of relocation table */
+#define DT_RELAENT 9 /* size of relocation entry */
+#define DT_STRSZ 10 /* size of string table */
+#define DT_SYMENT 11 /* size of symbol table entry */
+#define DT_INIT 12 /* address of initialization func. */
+#define DT_FINI 13 /* address of termination function */
+#define DT_SONAME 14 /* string table offset of shared obj */
+#define DT_RPATH 15 /* string table offset of library
+ search path */
+#define DT_SYMBOLIC 16 /* start sym search in shared obj. */
+#define DT_REL 17 /* address of rel. tbl. w addends */
+#define DT_RELSZ 18 /* size of DT_REL relocation table */
+#define DT_RELENT 19 /* size of DT_REL relocation entry */
+#define DT_PLTREL 20 /* PLT referenced relocation entry */
+#define DT_DEBUG 21 /* bugger */
+#define DT_TEXTREL 22 /* Allow rel. mod. to unwritable seg */
+#define DT_JMPREL 23 /* add. of PLT's relocation entries */
+#define DT_BIND_NOW 24 /* Bind now regardless of env setting */
+#define DT_NUM 25 /* Number used. */
+#define DT_LOPROC 0x70000000 /* reserved range for processor */
+#define DT_HIPROC 0x7fffffff /* specific dynamic array tags */
+
+/* Standard ELF hashing function */
+unsigned int elf_hash(const unsigned char *name);
+
+/*
+ * Note Definitions
+ */
+typedef struct {
+ Elf32_Word namesz;
+ Elf32_Word descsz;
+ Elf32_Word type;
+} Elf32_Note;
+
+typedef struct {
+ Elf64_Half namesz;
+ Elf64_Half descsz;
+ Elf64_Half type;
+} Elf64_Note;
+
+
+#if defined(ELFSIZE)
+#define CONCAT(x,y) __CONCAT(x,y)
+#define ELFNAME(x) CONCAT(elf,CONCAT(ELFSIZE,CONCAT(_,x)))
+#define ELFNAME2(x,y) CONCAT(x,CONCAT(_elf,CONCAT(ELFSIZE,CONCAT(_,y))))
+#define ELFNAMEEND(x) CONCAT(x,CONCAT(_elf,ELFSIZE))
+#define ELFDEFNNAME(x) CONCAT(ELF,CONCAT(ELFSIZE,CONCAT(_,x)))
+#endif
+
+#if defined(ELFSIZE) && (ELFSIZE == 32)
+#define Elf_Ehdr Elf32_Ehdr
+#define Elf_Phdr Elf32_Phdr
+#define Elf_Shdr Elf32_Shdr
+#define Elf_Sym Elf32_Sym
+#define Elf_Rel Elf32_Rel
+#define Elf_RelA Elf32_Rela
+#define Elf_Dyn Elf32_Dyn
+#define Elf_Word Elf32_Word
+#define Elf_Sword Elf32_Sword
+#define Elf_Addr Elf32_Addr
+#define Elf_Off Elf32_Off
+#define Elf_Nhdr Elf32_Nhdr
+#define Elf_Note Elf32_Note
+
+#define ELF_R_SYM ELF32_R_SYM
+#define ELF_R_TYPE ELF32_R_TYPE
+#define ELF_R_INFO ELF32_R_INFO
+#define ELFCLASS ELFCLASS32
+
+#define ELF_ST_BIND ELF32_ST_BIND
+#define ELF_ST_TYPE ELF32_ST_TYPE
+#define ELF_ST_INFO ELF32_ST_INFO
+
+#define AuxInfo Aux32Info
+#elif defined(ELFSIZE) && (ELFSIZE == 64)
+#define Elf_Ehdr Elf64_Ehdr
+#define Elf_Phdr Elf64_Phdr
+#define Elf_Shdr Elf64_Shdr
+#define Elf_Sym Elf64_Sym
+#define Elf_Rel Elf64_Rel
+#define Elf_RelA Elf64_Rela
+#define Elf_Dyn Elf64_Dyn
+#define Elf_Word Elf64_Word
+#define Elf_Sword Elf64_Sword
+#define Elf_Addr Elf64_Addr
+#define Elf_Off Elf64_Off
+#define Elf_Nhdr Elf64_Nhdr
+#define Elf_Note Elf64_Note
+
+#define ELF_R_SYM ELF64_R_SYM
+#define ELF_R_TYPE ELF64_R_TYPE
+#define ELF_R_INFO ELF64_R_INFO
+#define ELFCLASS ELFCLASS64
+
+#define ELF_ST_BIND ELF64_ST_BIND
+#define ELF_ST_TYPE ELF64_ST_TYPE
+#define ELF_ST_INFO ELF64_ST_INFO
+
+#define AuxInfo Aux64Info
+#endif
+
+#endif /* __XEN_PUBLIC_ELFSTRUCTS_H__ */
*
* Event channels between domains.
*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
* Copyright (c) 2003-2004, K A Fraser.
*/
#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__
#define __XEN_PUBLIC_EVENT_CHANNEL_H__
+/*
+ * Prototype for this hypercall is:
+ * int event_channel_op(int cmd, void *args)
+ * @cmd == EVTCHNOP_??? (event-channel operation).
+ * @args == Operation-specific extra arguments (NULL if none).
+ */
+
typedef uint32_t evtchn_port_t;
-DEFINE_GUEST_HANDLE(evtchn_port_t);
+DEFINE_XEN_GUEST_HANDLE(evtchn_port_t);
/*
* EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
* 1. If the caller is unprivileged then <dom> must be DOMID_SELF.
* 2. <rdom> may be DOMID_SELF, allowing loopback connections.
*/
-#define EVTCHNOP_alloc_unbound 6
+#define EVTCHNOP_alloc_unbound 6
struct evtchn_alloc_unbound {
- /* IN parameters */
- domid_t dom, remote_dom;
- /* OUT parameters */
- evtchn_port_t port;
+ /* IN parameters */
+ domid_t dom, remote_dom;
+ /* OUT parameters */
+ evtchn_port_t port;
};
+typedef struct evtchn_alloc_unbound evtchn_alloc_unbound_t;
/*
* EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
*/
#define EVTCHNOP_bind_interdomain 0
struct evtchn_bind_interdomain {
- /* IN parameters. */
- domid_t remote_dom;
- evtchn_port_t remote_port;
- /* OUT parameters. */
- evtchn_port_t local_port;
+ /* IN parameters. */
+ domid_t remote_dom;
+ evtchn_port_t remote_port;
+ /* OUT parameters. */
+ evtchn_port_t local_port;
};
+typedef struct evtchn_bind_interdomain evtchn_bind_interdomain_t;
/*
* EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <irq> on specified
* vcpu.
* NOTES:
- * 1. A virtual IRQ may be bound to at most one event channel per vcpu.
- * 2. The allocated event channel is bound to the specified vcpu. The binding
- * may not be changed.
+ * 1. Virtual IRQs are classified as per-vcpu or global. See the VIRQ list
+ * in xen.h for the classification of each VIRQ.
+ * 2. Global VIRQs must be allocated on VCPU0 but can subsequently be
+ * re-bound via EVTCHNOP_bind_vcpu.
+ * 3. Per-vcpu VIRQs may be bound to at most one event channel per vcpu.
+ * The allocated event channel is bound to the specified vcpu and the
+ * binding cannot be changed.
*/
-#define EVTCHNOP_bind_virq 1
+#define EVTCHNOP_bind_virq 1
struct evtchn_bind_virq {
- /* IN parameters. */
- uint32_t virq;
- uint32_t vcpu;
- /* OUT parameters. */
- evtchn_port_t port;
+ /* IN parameters. */
+ uint32_t virq;
+ uint32_t vcpu;
+ /* OUT parameters. */
+ evtchn_port_t port;
};
+typedef struct evtchn_bind_virq evtchn_bind_virq_t;
/*
* EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <irq>.
* 1. A physical IRQ may be bound to at most one event channel per domain.
* 2. Only a sufficiently-privileged domain may bind to a physical IRQ.
*/
-#define EVTCHNOP_bind_pirq 2
+#define EVTCHNOP_bind_pirq 2
struct evtchn_bind_pirq {
- /* IN parameters. */
- uint32_t pirq;
+ /* IN parameters. */
+ uint32_t pirq;
#define BIND_PIRQ__WILL_SHARE 1
- uint32_t flags; /* BIND_PIRQ__* */
- /* OUT parameters. */
- evtchn_port_t port;
+ uint32_t flags; /* BIND_PIRQ__* */
+ /* OUT parameters. */
+ evtchn_port_t port;
};
+typedef struct evtchn_bind_pirq evtchn_bind_pirq_t;
/*
* EVTCHNOP_bind_ipi: Bind a local event channel to receive events.
* 1. The allocated event channel is bound to the specified vcpu. The binding
* may not be changed.
*/
-#define EVTCHNOP_bind_ipi 7
+#define EVTCHNOP_bind_ipi 7
struct evtchn_bind_ipi {
- uint32_t vcpu;
- /* OUT parameters. */
- evtchn_port_t port;
+ uint32_t vcpu;
+ /* OUT parameters. */
+ evtchn_port_t port;
};
+typedef struct evtchn_bind_ipi evtchn_bind_ipi_t;
/*
* EVTCHNOP_close: Close a local event channel <port>. If the channel is
* interdomain then the remote end is placed in the unbound state
* (EVTCHNSTAT_unbound), awaiting a new connection.
*/
-#define EVTCHNOP_close 3
+#define EVTCHNOP_close 3
struct evtchn_close {
- /* IN parameters. */
- evtchn_port_t port;
+ /* IN parameters. */
+ evtchn_port_t port;
};
+typedef struct evtchn_close evtchn_close_t;
/*
* EVTCHNOP_send: Send an event to the remote end of the channel whose local
* endpoint is <port>.
*/
-#define EVTCHNOP_send 4
+#define EVTCHNOP_send 4
struct evtchn_send {
- /* IN parameters. */
- evtchn_port_t port;
+ /* IN parameters. */
+ evtchn_port_t port;
};
+typedef struct evtchn_send evtchn_send_t;
/*
* EVTCHNOP_status: Get the current status of the communication channel which
* 2. Only a sufficiently-privileged domain may obtain the status of an event
* channel for which <dom> is not DOMID_SELF.
*/
-#define EVTCHNOP_status 5
+#define EVTCHNOP_status 5
struct evtchn_status {
- /* IN parameters */
- domid_t dom;
- evtchn_port_t port;
- /* OUT parameters */
-#define EVTCHNSTAT_closed 0 /* Channel is not in use. */
-#define EVTCHNSTAT_unbound 1 /* Channel is waiting interdom connection.*/
-#define EVTCHNSTAT_interdomain 2 /* Channel is connected to remote domain. */
-#define EVTCHNSTAT_pirq 3 /* Channel is bound to a phys IRQ line. */
-#define EVTCHNSTAT_virq 4 /* Channel is bound to a virtual IRQ line */
-#define EVTCHNSTAT_ipi 5 /* Channel is bound to a virtual IPI line */
- uint32_t status;
- uint32_t vcpu; /* VCPU to which this channel is bound. */
- union {
- struct {
- domid_t dom;
- } unbound; /* EVTCHNSTAT_unbound */
- struct {
- domid_t dom;
- evtchn_port_t port;
- } interdomain; /* EVTCHNSTAT_interdomain */
- uint32_t pirq; /* EVTCHNSTAT_pirq */
- uint32_t virq; /* EVTCHNSTAT_virq */
- } u;
+ /* IN parameters */
+ domid_t dom;
+ evtchn_port_t port;
+ /* OUT parameters */
+#define EVTCHNSTAT_closed 0 /* Channel is not in use. */
+#define EVTCHNSTAT_unbound 1 /* Channel is waiting interdom connection.*/
+#define EVTCHNSTAT_interdomain 2 /* Channel is connected to remote domain. */
+#define EVTCHNSTAT_pirq 3 /* Channel is bound to a phys IRQ line. */
+#define EVTCHNSTAT_virq 4 /* Channel is bound to a virtual IRQ line */
+#define EVTCHNSTAT_ipi 5 /* Channel is bound to a virtual IPI line */
+ uint32_t status;
+ uint32_t vcpu; /* VCPU to which this channel is bound. */
+ union {
+ struct {
+ domid_t dom;
+ } unbound; /* EVTCHNSTAT_unbound */
+ struct {
+ domid_t dom;
+ evtchn_port_t port;
+ } interdomain; /* EVTCHNSTAT_interdomain */
+ uint32_t pirq; /* EVTCHNSTAT_pirq */
+ uint32_t virq; /* EVTCHNSTAT_virq */
+ } u;
};
+typedef struct evtchn_status evtchn_status_t;
/*
* EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
* event is pending.
* NOTES:
- * 1. IPI- and VIRQ-bound channels always notify the vcpu that initialised
- * the binding. This binding cannot be changed.
- * 2. All other channels notify vcpu0 by default. This default is set when
+ * 1. IPI-bound channels always notify the vcpu specified at bind time.
+ * This binding cannot be changed.
+ * 2. Per-VCPU VIRQ channels always notify the vcpu specified at bind time.
+ * This binding cannot be changed.
+ * 3. All other channels notify vcpu0 by default. This default is set when
* the channel is allocated (a port that is freed and subsequently reused
* has its binding reset to vcpu0).
*/
-#define EVTCHNOP_bind_vcpu 8
+#define EVTCHNOP_bind_vcpu 8
struct evtchn_bind_vcpu {
- /* IN parameters. */
- evtchn_port_t port;
- uint32_t vcpu;
+ /* IN parameters. */
+ evtchn_port_t port;
+ uint32_t vcpu;
};
+typedef struct evtchn_bind_vcpu evtchn_bind_vcpu_t;
/*
* EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver
* a notification to the appropriate VCPU if an event is pending.
*/
-#define EVTCHNOP_unmask 9
+#define EVTCHNOP_unmask 9
struct evtchn_unmask {
- /* IN parameters. */
- evtchn_port_t port;
+ /* IN parameters. */
+ evtchn_port_t port;
+};
+typedef struct evtchn_unmask evtchn_unmask_t;
+
+/*
+ * EVTCHNOP_reset: Close all event channels associated with specified domain.
+ * NOTES:
+ * 1. <dom> may be specified as DOMID_SELF.
+ * 2. Only a sufficiently-privileged domain may specify other than DOMID_SELF.
+ */
+#define EVTCHNOP_reset 10
+struct evtchn_reset {
+ /* IN parameters. */
+ domid_t dom;
};
+typedef struct evtchn_reset evtchn_reset_t;
+/*
+ * Argument to event_channel_op_compat() hypercall. Superceded by new
+ * event_channel_op() hypercall since 0x00030202.
+ */
struct evtchn_op {
- uint32_t cmd; /* EVTCHNOP_* */
- union {
- struct evtchn_alloc_unbound alloc_unbound;
- struct evtchn_bind_interdomain bind_interdomain;
- struct evtchn_bind_virq bind_virq;
- struct evtchn_bind_pirq bind_pirq;
- struct evtchn_bind_ipi bind_ipi;
- struct evtchn_close close;
- struct evtchn_send send;
- struct evtchn_status status;
- struct evtchn_bind_vcpu bind_vcpu;
- struct evtchn_unmask unmask;
- } u;
+ uint32_t cmd; /* EVTCHNOP_* */
+ union {
+ struct evtchn_alloc_unbound alloc_unbound;
+ struct evtchn_bind_interdomain bind_interdomain;
+ struct evtchn_bind_virq bind_virq;
+ struct evtchn_bind_pirq bind_pirq;
+ struct evtchn_bind_ipi bind_ipi;
+ struct evtchn_close close;
+ struct evtchn_send send;
+ struct evtchn_status status;
+ struct evtchn_bind_vcpu bind_vcpu;
+ struct evtchn_unmask unmask;
+ } u;
};
-DEFINE_GUEST_HANDLE_STRUCT(evtchn_op);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(evtchn_op);
+typedef struct evtchn_op evtchn_op_t;
+DEFINE_XEN_GUEST_HANDLE(evtchn_op_t);
#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
*
* Feature flags, reported by XENVER_get_features.
*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
* Copyright (c) 2006, Keir Fraser <keir@xensource.com>
*/
*/
#define XENFEAT_pae_pgdir_above_4gb 4
+/* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */
+#define XENFEAT_mmu_pt_update_preserve_ad 5
+
#define XENFEAT_NR_SUBMAPS 1
#endif /* __XEN_PUBLIC_FEATURES_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
*/
uint32_t frame;
};
+typedef struct grant_entry grant_entry_t;
/*
* Type of grant entry.
* GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
* GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
* GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
+ * GTF_PAT, GTF_PWT, GTF_PCD: (x86) cache attribute flags for the grant [GST]
*/
#define _GTF_readonly (2)
#define GTF_readonly (1U<<_GTF_readonly)
#define GTF_reading (1U<<_GTF_reading)
#define _GTF_writing (4)
#define GTF_writing (1U<<_GTF_writing)
+#define _GTF_PWT (5)
+#define GTF_PWT (1U<<_GTF_PWT)
+#define _GTF_PCD (6)
+#define GTF_PCD (1U<<_GTF_PCD)
+#define _GTF_PAT (7)
+#define GTF_PAT (1U<<_GTF_PAT)
/*
* Subflags for GTF_accept_transfer:
grant_handle_t handle;
uint64_t dev_bus_addr;
};
-DEFINE_GUEST_HANDLE_STRUCT(gnttab_map_grant_ref);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_map_grant_ref);
+typedef struct gnttab_map_grant_ref gnttab_map_grant_ref_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_map_grant_ref_t);
/*
* GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
/* OUT parameters. */
int16_t status; /* GNTST_* */
};
-DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_grant_ref);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_unmap_grant_ref);
+typedef struct gnttab_unmap_grant_ref gnttab_unmap_grant_ref_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_grant_ref_t);
/*
* GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
uint32_t nr_frames;
/* OUT parameters. */
int16_t status; /* GNTST_* */
- GUEST_HANDLE(ulong) frame_list;
+ XEN_GUEST_HANDLE(ulong) frame_list;
};
-DEFINE_GUEST_HANDLE_STRUCT(gnttab_setup_table);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_setup_table);
+typedef struct gnttab_setup_table gnttab_setup_table_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t);
/*
* GNTTABOP_dump_table: Dump the contents of the grant table to the
/* OUT parameters. */
int16_t status; /* GNTST_* */
};
-DEFINE_GUEST_HANDLE_STRUCT(gnttab_dump_table);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_dump_table);
+typedef struct gnttab_dump_table gnttab_dump_table_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_dump_table_t);
/*
* GNTTABOP_transfer_grant_ref: Transfer <frame> to a foreign domain. The
#define GNTTABOP_transfer 4
struct gnttab_transfer {
/* IN parameters. */
- unsigned long mfn;
+ xen_pfn_t mfn;
domid_t domid;
grant_ref_t ref;
/* OUT parameters. */
int16_t status;
};
-DEFINE_GUEST_HANDLE_STRUCT(gnttab_transfer);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_transfer);
+typedef struct gnttab_transfer gnttab_transfer_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t);
+
/*
* GNTTABOP_copy: Hypervisor based copy
#define GNTCOPY_dest_gref (1<<_GNTCOPY_dest_gref)
#define GNTTABOP_copy 5
-struct gnttab_copy {
- /* IN parameters. */
- struct {
- union {
- grant_ref_t ref;
- unsigned long gmfn;
- } u;
- domid_t domid;
- uint16_t offset;
- } source, dest;
- uint16_t len;
- uint16_t flags; /* GNTCOPY_* */
- /* OUT parameters. */
- int16_t status;
-};
-DEFINE_GUEST_HANDLE_STRUCT(gnttab_copy);
+typedef struct gnttab_copy {
+ /* IN parameters. */
+ struct {
+ union {
+ grant_ref_t ref;
+ xen_pfn_t gmfn;
+ } u;
+ domid_t domid;
+ uint16_t offset;
+ } source, dest;
+ uint16_t len;
+ uint16_t flags; /* GNTCOPY_* */
+ /* OUT parameters. */
+ int16_t status;
+} gnttab_copy_t;
+DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_copy);
+DEFINE_XEN_GUEST_HANDLE(gnttab_copy_t);
/*
* GNTTABOP_query_size: Query the current and maximum sizes of the shared
uint32_t max_nr_frames;
int16_t status; /* GNTST_* */
};
-DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_query_size);
+typedef struct gnttab_query_size gnttab_query_size_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_query_size_t);
+
+/*
+ * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference mappings
+ * tracked by <handle> but atomically replace the page table entry with one
+ * pointing to the machine address under <new_addr>. <new_addr> will be
+ * redirected to the null entry.
+ * NOTES:
+ * 1. The call may fail in an undefined manner if either mapping is not
+ * tracked by <handle>.
+ * 2. After executing a batch of unmaps, it is guaranteed that no stale
+ * mappings will remain in the device or host TLBs.
+ */
+#define GNTTABOP_unmap_and_replace 7
+struct gnttab_unmap_and_replace {
+ /* IN parameters. */
+ uint64_t host_addr;
+ uint64_t new_addr;
+ grant_handle_t handle;
+ /* OUT parameters. */
+ int16_t status; /* GNTST_* */
+};
+typedef struct gnttab_unmap_and_replace gnttab_unmap_and_replace_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_and_replace_t);
+
/*
* Bitfield values for update_pin_status.flags.
#define GNTST_no_device_space (-7) /* Out of space in I/O MMU. */
#define GNTST_permission_denied (-8) /* Not enough privilege for operation. */
#define GNTST_bad_page (-9) /* Specified page was invalid for op. */
-#define GNTST_bad_copy_arg (-10) /* copy arguments cross page boundary */
+#define GNTST_bad_copy_arg (-10) /* copy arguments cross page boundary. */
+#define GNTST_address_too_big (-11) /* transfer page address too large. */
#define GNTTABOP_error_msgs { \
"okay", \
"no spare translation slot in the I/O MMU", \
"permission denied", \
"bad page", \
- "copy arguments cross page boundary" \
+ "copy arguments cross page boundary", \
+ "page address size too large" \
}
#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_E820_H__
+#define __XEN_PUBLIC_HVM_E820_H__
+
+/* E820 location in HVM virtual address space. */
+#define HVM_E820_PAGE 0x00090000
+#define HVM_E820_NR_OFFSET 0x000001E8
+#define HVM_E820_OFFSET 0x000002D0
+
+#define HVM_BELOW_4G_RAM_END 0xF0000000
+#define HVM_BELOW_4G_MMIO_START HVM_BELOW_4G_RAM_END
+#define HVM_BELOW_4G_MMIO_LENGTH ((1ULL << 32) - HVM_BELOW_4G_MMIO_START)
+
+#endif /* __XEN_PUBLIC_HVM_E820_H__ */
--- /dev/null
+/******************************************************************************
+ * hvm/hvm_info_table.h
+ *
+ * HVM parameter and information table, written into guest memory map.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
+#define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
+
+#define HVM_INFO_PFN 0x09F
+#define HVM_INFO_OFFSET 0x800
+#define HVM_INFO_PADDR ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET)
+
+struct hvm_info_table {
+ char signature[8]; /* "HVM INFO" */
+ uint32_t length;
+ uint8_t checksum;
+ uint8_t acpi_enabled;
+ uint8_t apic_mode;
+ uint32_t nr_vcpus;
+};
+
+#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
--- /dev/null
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_HVM_OP_H__
+#define __XEN_PUBLIC_HVM_HVM_OP_H__
+
+/* Get/set subcommands: extra argument == pointer to xen_hvm_param struct. */
+#define HVMOP_set_param 0
+#define HVMOP_get_param 1
+struct xen_hvm_param {
+ domid_t domid; /* IN */
+ uint32_t index; /* IN */
+ uint64_t value; /* IN/OUT */
+};
+typedef struct xen_hvm_param xen_hvm_param_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_param_t);
+
+/* Set the logical level of one of a domain's PCI INTx wires. */
+#define HVMOP_set_pci_intx_level 2
+struct xen_hvm_set_pci_intx_level {
+ /* Domain to be updated. */
+ domid_t domid;
+ /* PCI INTx identification in PCI topology (domain:bus:device:intx). */
+ uint8_t domain, bus, device, intx;
+ /* Assertion level (0 = unasserted, 1 = asserted). */
+ uint8_t level;
+};
+typedef struct xen_hvm_set_pci_intx_level xen_hvm_set_pci_intx_level_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t);
+
+/* Set the logical level of one of a domain's ISA IRQ wires. */
+#define HVMOP_set_isa_irq_level 3
+struct xen_hvm_set_isa_irq_level {
+ /* Domain to be updated. */
+ domid_t domid;
+ /* ISA device identification, by ISA IRQ (0-15). */
+ uint8_t isa_irq;
+ /* Assertion level (0 = unasserted, 1 = asserted). */
+ uint8_t level;
+};
+typedef struct xen_hvm_set_isa_irq_level xen_hvm_set_isa_irq_level_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t);
+
+#define HVMOP_set_pci_link_route 4
+struct xen_hvm_set_pci_link_route {
+ /* Domain to be updated. */
+ domid_t domid;
+ /* PCI link identifier (0-3). */
+ uint8_t link;
+ /* ISA IRQ (1-15), or 0 (disable link). */
+ uint8_t isa_irq;
+};
+typedef struct xen_hvm_set_pci_link_route xen_hvm_set_pci_link_route_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t);
+
+/* Flushes all VCPU TLBs: @arg must be NULL. */
+#define HVMOP_flush_tlbs 5
+
+#endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
--- /dev/null
+/*
+ * ioreq.h: I/O request definitions for device models
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _IOREQ_H_
+#define _IOREQ_H_
+
+#define IOREQ_READ 1
+#define IOREQ_WRITE 0
+
+#define STATE_IOREQ_NONE 0
+#define STATE_IOREQ_READY 1
+#define STATE_IOREQ_INPROCESS 2
+#define STATE_IORESP_READY 3
+
+#define IOREQ_TYPE_PIO 0 /* pio */
+#define IOREQ_TYPE_COPY 1 /* mmio ops */
+#define IOREQ_TYPE_TIMEOFFSET 7
+#define IOREQ_TYPE_INVALIDATE 8 /* mapcache */
+
+/*
+ * VMExit dispatcher should cooperate with instruction decoder to
+ * prepare this structure and notify service OS and DM by sending
+ * virq
+ */
+struct ioreq {
+ uint64_t addr; /* physical address */
+ uint64_t size; /* size in bytes */
+ uint64_t count; /* for rep prefixes */
+ uint64_t data; /* data (or paddr of data) */
+ uint8_t state:4;
+ uint8_t data_is_ptr:1; /* if 1, data above is the guest paddr
+ * of the real data to use. */
+ uint8_t dir:1; /* 1=read, 0=write */
+ uint8_t df:1;
+ uint8_t pad:1;
+ uint8_t type; /* I/O type */
+ uint8_t _pad0[6];
+ uint64_t io_count; /* How many IO done on a vcpu */
+};
+typedef struct ioreq ioreq_t;
+
+struct vcpu_iodata {
+ struct ioreq vp_ioreq;
+ /* Event channel port, used for notifications to/from the device model. */
+ uint32_t vp_eport;
+ uint32_t _pad0;
+};
+typedef struct vcpu_iodata vcpu_iodata_t;
+
+struct shared_iopage {
+ struct vcpu_iodata vcpu_iodata[1];
+};
+typedef struct shared_iopage shared_iopage_t;
+
+struct buf_ioreq {
+ uint8_t type; /* I/O type */
+ uint8_t pad:1;
+ uint8_t dir:1; /* 1=read, 0=write */
+ uint8_t size:2; /* 0=>1, 1=>2, 2=>4, 3=>8. If 8, use two buf_ioreqs */
+ uint32_t addr:20;/* physical address */
+ uint32_t data; /* data */
+};
+typedef struct buf_ioreq buf_ioreq_t;
+
+#define IOREQ_BUFFER_SLOT_NUM 511 /* 8 bytes each, plus 2 4-byte indexes */
+struct buffered_iopage {
+ unsigned int read_pointer;
+ unsigned int write_pointer;
+ buf_ioreq_t buf_ioreq[IOREQ_BUFFER_SLOT_NUM];
+}; /* NB. Size of this structure must be no greater than one page. */
+typedef struct buffered_iopage buffered_iopage_t;
+
+#if defined(__ia64__)
+struct pio_buffer {
+ uint32_t page_offset;
+ uint32_t pointer;
+ uint32_t data_end;
+ uint32_t buf_size;
+ void *opaque;
+};
+
+#define PIO_BUFFER_IDE_PRIMARY 0 /* I/O port = 0x1F0 */
+#define PIO_BUFFER_IDE_SECONDARY 1 /* I/O port = 0x170 */
+#define PIO_BUFFER_ENTRY_NUM 2
+struct buffered_piopage {
+ struct pio_buffer pio[PIO_BUFFER_ENTRY_NUM];
+ uint8_t buffer[1];
+};
+#endif /* defined(__ia64__) */
+
+#define ACPI_PM1A_EVT_BLK_ADDRESS 0x0000000000001f40
+#define ACPI_PM1A_CNT_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x04)
+#define ACPI_PM_TMR_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x08)
+#define ACPI_GPE0_BLK_ADDRESS (ACPI_PM_TMR_BLK_ADDRESS + 0x20)
+#define ACPI_GPE0_BLK_LEN 0x08
+
+#endif /* _IOREQ_H_ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_PARAMS_H__
+#define __XEN_PUBLIC_HVM_PARAMS_H__
+
+#include "hvm_op.h"
+
+/*
+ * Parameter space for HVMOP_{set,get}_param.
+ */
+
+/*
+ * How should CPU0 event-channel notifications be delivered?
+ * val[63:56] == 0: val[55:0] is a delivery GSI (Global System Interrupt).
+ * val[63:56] == 1: val[55:0] is a delivery PCI INTx line, as follows:
+ * Domain = val[47:32], Bus = val[31:16],
+ * DevFn = val[15: 8], IntX = val[ 1: 0]
+ * If val == 0 then CPU0 event-channel notifications are not delivered.
+ */
+#define HVM_PARAM_CALLBACK_IRQ 0
+
+/*
+ * These are not used by Xen. They are here for convenience of HVM-guest
+ * xenbus implementations.
+ */
+#define HVM_PARAM_STORE_PFN 1
+#define HVM_PARAM_STORE_EVTCHN 2
+
+#define HVM_PARAM_PAE_ENABLED 4
+
+#define HVM_PARAM_IOREQ_PFN 5
+
+#define HVM_PARAM_BUFIOREQ_PFN 6
+
+#ifdef __ia64__
+#define HVM_PARAM_NVRAM_FD 7
+#define HVM_PARAM_VHPT_SIZE 8
+#define HVM_PARAM_BUFPIOREQ_PFN 9
+#endif
+
+/*
+ * Set mode for virtual timers (currently x86 only):
+ * delay_for_missed_ticks (default):
+ * Do not advance a vcpu's time beyond the correct delivery time for
+ * interrupts that have been missed due to preemption. Deliver missed
+ * interrupts when the vcpu is rescheduled and advance the vcpu's virtual
+ * time stepwise for each one.
+ * no_delay_for_missed_ticks:
+ * As above, missed interrupts are delivered, but guest time always tracks
+ * wallclock (i.e., real) time while doing so.
+ * no_missed_ticks_pending:
+ * No missed interrupts are held pending. Instead, to ensure ticks are
+ * delivered at some non-zero rate, if we detect missed ticks then the
+ * internal tick alarm is not disabled if the VCPU is preempted during the
+ * next tick period.
+ * one_missed_tick_pending:
+ * Missed interrupts are collapsed together and delivered as one 'late tick'.
+ * Guest time always tracks wallclock (i.e., real) time.
+ */
+#define HVM_PARAM_TIMER_MODE 10
+#define HVMPTM_delay_for_missed_ticks 0
+#define HVMPTM_no_delay_for_missed_ticks 1
+#define HVMPTM_no_missed_ticks_pending 2
+#define HVMPTM_one_missed_tick_pending 3
+
+/* Boolean: Enable virtual HPET (high-precision event timer)? (x86-only) */
+#define HVM_PARAM_HPET_ENABLED 11
+
+#define HVM_NR_PARAMS 12
+
+#endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
--- /dev/null
+/*
+ * hvm/save.h
+ *
+ * Structure definitions for HVM state that is held by Xen and must
+ * be saved along with the domain's memory and device-model state.
+ *
+ * Copyright (c) 2007 XenSource Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_SAVE_H__
+#define __XEN_PUBLIC_HVM_SAVE_H__
+
+/*
+ * Structures in this header *must* have the same layout in 32bit
+ * and 64bit environments: this means that all fields must be explicitly
+ * sized types and aligned to their sizes, and the structs must be
+ * a multiple of eight bytes long.
+ *
+ * Only the state necessary for saving and restoring (i.e. fields
+ * that are analogous to actual hardware state) should go in this file.
+ * Internal mechanisms should be kept in Xen-private headers.
+ */
+
+#if !defined(__GNUC__) || defined(__STRICT_ANSI__)
+#error "Anonymous structs/unions are a GNU extension."
+#endif
+
+/*
+ * Each entry is preceded by a descriptor giving its type and length
+ */
+struct hvm_save_descriptor {
+ uint16_t typecode; /* Used to demux the various types below */
+ uint16_t instance; /* Further demux within a type */
+ uint32_t length; /* In bytes, *not* including this descriptor */
+};
+
+
+/*
+ * Each entry has a datatype associated with it: for example, the CPU state
+ * is saved as a HVM_SAVE_TYPE(CPU), which has HVM_SAVE_LENGTH(CPU),
+ * and is identified by a descriptor with typecode HVM_SAVE_CODE(CPU).
+ * DECLARE_HVM_SAVE_TYPE binds these things together with some type-system
+ * ugliness.
+ */
+
+#define DECLARE_HVM_SAVE_TYPE(_x, _code, _type) \
+ struct __HVM_SAVE_TYPE_##_x { _type t; char c[_code]; }
+
+#define HVM_SAVE_TYPE(_x) typeof (((struct __HVM_SAVE_TYPE_##_x *)(0))->t)
+#define HVM_SAVE_LENGTH(_x) (sizeof (HVM_SAVE_TYPE(_x)))
+#define HVM_SAVE_CODE(_x) (sizeof (((struct __HVM_SAVE_TYPE_##_x *)(0))->c))
+
+
+/*
+ * The series of save records is teminated by a zero-type, zero-length
+ * descriptor.
+ */
+
+struct hvm_save_end {};
+DECLARE_HVM_SAVE_TYPE(END, 0, struct hvm_save_end);
+
+#if defined(__i386__) || defined(__x86_64__)
+#include "../arch-x86/hvm/save.h"
+#elif defined(__ia64__)
+#include "../arch-ia64/hvm/save.h"
+#else
+#error "unsupported architecture"
+#endif
+
+#endif /* __XEN_PUBLIC_HVM_SAVE_H__ */
--- /dev/null
+/*
+ * vmx_assist.h: Context definitions for the VMXASSIST world switch.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Leendert van Doorn, leendert@watson.ibm.com
+ * Copyright (c) 2005, International Business Machines Corporation.
+ */
+
+#ifndef _VMX_ASSIST_H_
+#define _VMX_ASSIST_H_
+
+#define VMXASSIST_BASE 0xD0000
+#define VMXASSIST_MAGIC 0x17101966
+#define VMXASSIST_MAGIC_OFFSET (VMXASSIST_BASE+8)
+
+#define VMXASSIST_NEW_CONTEXT (VMXASSIST_BASE + 12)
+#define VMXASSIST_OLD_CONTEXT (VMXASSIST_NEW_CONTEXT + 4)
+
+#ifndef __ASSEMBLY__
+
+#define NR_EXCEPTION_HANDLER 32
+#define NR_INTERRUPT_HANDLERS 16
+#define NR_TRAPS (NR_EXCEPTION_HANDLER+NR_INTERRUPT_HANDLERS)
+
+union vmcs_arbytes {
+ struct arbyte_fields {
+ unsigned int seg_type : 4,
+ s : 1,
+ dpl : 2,
+ p : 1,
+ reserved0 : 4,
+ avl : 1,
+ reserved1 : 1,
+ default_ops_size: 1,
+ g : 1,
+ null_bit : 1,
+ reserved2 : 15;
+ } fields;
+ unsigned int bytes;
+};
+
+/*
+ * World switch state
+ */
+struct vmx_assist_context {
+ uint32_t eip; /* execution pointer */
+ uint32_t esp; /* stack pointer */
+ uint32_t eflags; /* flags register */
+ uint32_t cr0;
+ uint32_t cr3; /* page table directory */
+ uint32_t cr4;
+ uint32_t idtr_limit; /* idt */
+ uint32_t idtr_base;
+ uint32_t gdtr_limit; /* gdt */
+ uint32_t gdtr_base;
+ uint32_t cs_sel; /* cs selector */
+ uint32_t cs_limit;
+ uint32_t cs_base;
+ union vmcs_arbytes cs_arbytes;
+ uint32_t ds_sel; /* ds selector */
+ uint32_t ds_limit;
+ uint32_t ds_base;
+ union vmcs_arbytes ds_arbytes;
+ uint32_t es_sel; /* es selector */
+ uint32_t es_limit;
+ uint32_t es_base;
+ union vmcs_arbytes es_arbytes;
+ uint32_t ss_sel; /* ss selector */
+ uint32_t ss_limit;
+ uint32_t ss_base;
+ union vmcs_arbytes ss_arbytes;
+ uint32_t fs_sel; /* fs selector */
+ uint32_t fs_limit;
+ uint32_t fs_base;
+ union vmcs_arbytes fs_arbytes;
+ uint32_t gs_sel; /* gs selector */
+ uint32_t gs_limit;
+ uint32_t gs_base;
+ union vmcs_arbytes gs_arbytes;
+ uint32_t tr_sel; /* task selector */
+ uint32_t tr_limit;
+ uint32_t tr_base;
+ union vmcs_arbytes tr_arbytes;
+ uint32_t ldtr_sel; /* ldtr selector */
+ uint32_t ldtr_limit;
+ uint32_t ldtr_base;
+ union vmcs_arbytes ldtr_arbytes;
+
+ unsigned char rm_irqbase[2];
+};
+typedef struct vmx_assist_context vmx_assist_context_t;
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _VMX_ASSIST_H_ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
*
* Unified block-device I/O interface for Xen guest OSes.
*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
* Copyright (c) 2003-2004, Keir Fraser
*/
* rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
*/
-typedef uint16_t blkif_vdev_t;
-typedef uint64_t blkif_sector_t;
+#ifndef blkif_vdev_t
+#define blkif_vdev_t uint16_t
+#endif
+#define blkif_sector_t uint64_t
/*
* REQUEST CODES.
#define BLKIF_OP_WRITE 1
/*
* Recognised only if "feature-barrier" is present in backend xenbus info.
- * The "feature_barrier" node contains a boolean indicating whether barrier
+ * The "feature-barrier" node contains a boolean indicating whether barrier
* requests are likely to succeed or fail. Either way, a barrier request
* may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
* the underlying block-device hardware. The boolean simply indicates whether
* create the "feature-barrier" node!
*/
#define BLKIF_OP_WRITE_BARRIER 2
+/*
+ * Recognised if "feature-flush-cache" is present in backend xenbus
+ * info. A flush will ask the underlying storage hardware to flush its
+ * non-volatile caches as appropriate. The "feature-flush-cache" node
+ * contains a boolean indicating whether flush requests are likely to
+ * succeed or fail. Either way, a flush request may fail at any time
+ * with BLKIF_RSP_EOPNOTSUPP if it is unsupported by the underlying
+ * block-device hardware. The boolean simply indicates whether or not it
+ * is worthwhile for the frontend to attempt flushes. If a backend does
+ * not recognise BLKIF_OP_WRITE_FLUSH_CACHE, it should *not* create the
+ * "feature-flush-cache" node!
+ */
+#define BLKIF_OP_FLUSH_DISKCACHE 3
/*
* Maximum scatter/gather segments per request.
- * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
+ * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
* NB. This could be 12 if the ring indexes weren't stored in the same page.
*/
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
+struct blkif_request_segment {
+ grant_ref_t gref; /* reference to I/O buffer frame */
+ /* @first_sect: first sector in frame to transfer (inclusive). */
+ /* @last_sect: last sector in frame to transfer (inclusive). */
+ uint8_t first_sect, last_sect;
+};
+
struct blkif_request {
- uint8_t operation; /* BLKIF_OP_??? */
- uint8_t nr_segments; /* number of segments */
- blkif_vdev_t handle; /* only for read/write requests */
- uint64_t id; /* private guest value, echoed in resp */
- blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
- struct blkif_request_segment {
- grant_ref_t gref; /* reference to I/O buffer frame */
- /* @first_sect: first sector in frame to transfer (inclusive). */
- /* @last_sect: last sector in frame to transfer (inclusive). */
- uint8_t first_sect, last_sect;
- } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ uint8_t operation; /* BLKIF_OP_??? */
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+ uint64_t id; /* private guest value, echoed in resp */
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};
+typedef struct blkif_request blkif_request_t;
struct blkif_response {
- uint64_t id; /* copied from request */
- uint8_t operation; /* copied from request */
- int16_t status; /* BLKIF_RSP_??? */
+ uint64_t id; /* copied from request */
+ uint8_t operation; /* copied from request */
+ int16_t status; /* BLKIF_RSP_??? */
};
+typedef struct blkif_response blkif_response_t;
/*
* STATUS RETURN CODES.
#define VDISK_READONLY 0x4
#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
*
* Console I/O interface for Xen guest OSes.
*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
* Copyright (c) 2005, Keir Fraser
*/
};
#endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
*/
#define XENFB_TYPE_UPDATE 2
-struct xenfb_update {
- uint8_t type; /* XENFB_TYPE_UPDATE */
- int32_t x; /* source x */
- int32_t y; /* source y */
- int32_t width; /* rect width */
- int32_t height; /* rect height */
+struct xenfb_update
+{
+ uint8_t type; /* XENFB_TYPE_UPDATE */
+ int32_t x; /* source x */
+ int32_t y; /* source y */
+ int32_t width; /* rect width */
+ int32_t height; /* rect height */
+};
+
+/*
+ * Framebuffer resize notification event
+ * Capable backend sets feature-resize in xenstore.
+ */
+#define XENFB_TYPE_RESIZE 3
+
+struct xenfb_resize
+{
+ uint8_t type; /* XENFB_TYPE_RESIZE */
+ int32_t width; /* width in pixels */
+ int32_t height; /* height in pixels */
+ int32_t stride; /* stride in bytes */
+ int32_t depth; /* depth in bits */
+ int32_t offset; /* offset of the framebuffer in bytes */
};
#define XENFB_OUT_EVENT_SIZE 40
-union xenfb_out_event {
- uint8_t type;
- struct xenfb_update update;
- char pad[XENFB_OUT_EVENT_SIZE];
+union xenfb_out_event
+{
+ uint8_t type;
+ struct xenfb_update update;
+ struct xenfb_resize resize;
+ char pad[XENFB_OUT_EVENT_SIZE];
};
/* In events (backend -> frontend) */
#define XENFB_IN_EVENT_SIZE 40
-union xenfb_in_event {
- uint8_t type;
- char pad[XENFB_IN_EVENT_SIZE];
+union xenfb_in_event
+{
+ uint8_t type;
+ char pad[XENFB_IN_EVENT_SIZE];
};
/* shared page */
#define XENFB_IN_RING_LEN (XENFB_IN_RING_SIZE / XENFB_IN_EVENT_SIZE)
#define XENFB_IN_RING_OFFS 1024
#define XENFB_IN_RING(page) \
- ((union xenfb_in_event *)((char *)(page) + XENFB_IN_RING_OFFS))
+ ((union xenfb_in_event *)((char *)(page) + XENFB_IN_RING_OFFS))
#define XENFB_IN_RING_REF(page, idx) \
- (XENFB_IN_RING((page))[(idx) % XENFB_IN_RING_LEN])
+ (XENFB_IN_RING((page))[(idx) % XENFB_IN_RING_LEN])
#define XENFB_OUT_RING_SIZE 2048
#define XENFB_OUT_RING_LEN (XENFB_OUT_RING_SIZE / XENFB_OUT_EVENT_SIZE)
#define XENFB_OUT_RING_OFFS (XENFB_IN_RING_OFFS + XENFB_IN_RING_SIZE)
#define XENFB_OUT_RING(page) \
- ((union xenfb_out_event *)((char *)(page) + XENFB_OUT_RING_OFFS))
+ ((union xenfb_out_event *)((char *)(page) + XENFB_OUT_RING_OFFS))
#define XENFB_OUT_RING_REF(page, idx) \
- (XENFB_OUT_RING((page))[(idx) % XENFB_OUT_RING_LEN])
-
-struct xenfb_page {
- uint32_t in_cons, in_prod;
- uint32_t out_cons, out_prod;
-
- int32_t width; /* width of the framebuffer (in pixels) */
- int32_t height; /* height of the framebuffer (in pixels) */
- uint32_t line_length; /* length of a row of pixels (in bytes) */
- uint32_t mem_length; /* length of the framebuffer (in bytes) */
- uint8_t depth; /* depth of a pixel (in bits) */
-
- /*
- * Framebuffer page directory
- *
- * Each directory page holds PAGE_SIZE / sizeof(*pd)
- * framebuffer pages, and can thus map up to PAGE_SIZE *
- * PAGE_SIZE / sizeof(*pd) bytes. With PAGE_SIZE == 4096 and
- * sizeof(unsigned long) == 4, that's 4 Megs. Two directory
- * pages should be enough for a while.
- */
+ (XENFB_OUT_RING((page))[(idx) % XENFB_OUT_RING_LEN])
+
+struct xenfb_page
+{
+ uint32_t in_cons, in_prod;
+ uint32_t out_cons, out_prod;
+
+ int32_t width; /* the width of the framebuffer (in pixels) */
+ int32_t height; /* the height of the framebuffer (in pixels) */
+ uint32_t line_length; /* the length of a row of pixels (in bytes) */
+ uint32_t mem_length; /* the length of the framebuffer (in bytes) */
+ uint8_t depth; /* the depth of a pixel (in bits) */
+
+ /*
+ * Framebuffer page directory
+ *
+ * Each directory page holds PAGE_SIZE / sizeof(*pd)
+ * framebuffer pages, and can thus map up to PAGE_SIZE *
+ * PAGE_SIZE / sizeof(*pd) bytes. With PAGE_SIZE == 4096 and
+ * sizeof(unsigned long) == 4/8, that's 4 Megs 32 bit and 2 Megs
+ * 64 bit. 256 directories give enough room for a 512 Meg
+ * framebuffer with a max resolution of 12,800x10,240. Should
+ * be enough for a while with room leftover for expansion.
+ */
+#ifndef CONFIG_PARAVIRT_XEN
+ unsigned long pd[256];
+#else
+ /* Two directory pages should be enough for a while. */
unsigned long pd[2];
+#endif
};
/*
- * Wart: xenkbd needs to know resolution. Put it here until a better
- * solution is found, but don't leak it to the backend.
+ * Wart: xenkbd needs to know default resolution. Put it here until a
+ * better solution is found, but don't leak it to the backend.
*/
#ifdef __KERNEL__
#define XENFB_WIDTH 800
#endif
#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * fsif.h
+ *
+ * Interface to FS level split device drivers.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2007, Grzegorz Milos, Sun Microsystems, Inc.
+ */
+
+#ifndef __XEN_PUBLIC_IO_FSIF_H__
+#define __XEN_PUBLIC_IO_FSIF_H__
+
+#include "ring.h"
+#include "../grant_table.h"
+
+#define REQ_FILE_OPEN 1
+#define REQ_FILE_CLOSE 2
+#define REQ_FILE_READ 3
+#define REQ_FILE_WRITE 4
+#define REQ_STAT 5
+#define REQ_FILE_TRUNCATE 6
+#define REQ_REMOVE 7
+#define REQ_RENAME 8
+#define REQ_CREATE 9
+#define REQ_DIR_LIST 10
+#define REQ_CHMOD 11
+#define REQ_FS_SPACE 12
+#define REQ_FILE_SYNC 13
+
+struct fsif_open_request {
+ grant_ref_t gref;
+};
+
+struct fsif_close_request {
+ uint32_t fd;
+};
+
+struct fsif_read_request {
+ uint32_t fd;
+ grant_ref_t gref;
+ uint64_t len;
+ uint64_t offset;
+};
+
+struct fsif_write_request {
+ uint32_t fd;
+ grant_ref_t gref;
+ uint64_t len;
+ uint64_t offset;
+};
+
+struct fsif_stat_request {
+ uint32_t fd;
+ grant_ref_t gref;
+};
+
+/* This structure is a copy of some fields from stat structure, writen to the
+ * granted page. */
+struct fsif_stat_response {
+ int32_t stat_mode;
+ uint32_t stat_uid;
+ uint32_t stat_gid;
+ int32_t pad;
+ int64_t stat_size;
+ int64_t stat_atime;
+ int64_t stat_mtime;
+ int64_t stat_ctime;
+};
+
+struct fsif_truncate_request {
+ uint32_t fd;
+ int32_t pad;
+ int64_t length;
+};
+
+struct fsif_remove_request {
+ grant_ref_t gref;
+};
+
+struct fsif_rename_request {
+ uint16_t old_name_offset;
+ uint16_t new_name_offset;
+ grant_ref_t gref;
+};
+
+struct fsif_create_request {
+ int8_t directory;
+ int8_t pad;
+ int16_t pad2;
+ int32_t mode;
+ grant_ref_t gref;
+};
+
+struct fsif_list_request {
+ uint32_t offset;
+ grant_ref_t gref;
+};
+
+#define NR_FILES_SHIFT 0
+#define NR_FILES_SIZE 16 /* 16 bits for the number of files mask */
+#define NR_FILES_MASK (((1ULL << NR_FILES_SIZE) - 1) << NR_FILES_SHIFT)
+#define ERROR_SIZE 32 /* 32 bits for the error mask */
+#define ERROR_SHIFT (NR_FILES_SIZE + NR_FILES_SHIFT)
+#define ERROR_MASK (((1ULL << ERROR_SIZE) - 1) << ERROR_SHIFT)
+#define HAS_MORE_SHIFT (ERROR_SHIFT + ERROR_SIZE)
+#define HAS_MORE_FLAG (1ULL << HAS_MORE_SHIFT)
+
+struct fsif_chmod_request {
+ uint32_t fd;
+ int32_t mode;
+};
+
+struct fsif_space_request {
+ grant_ref_t gref;
+};
+
+struct fsif_sync_request {
+ uint32_t fd;
+};
+
+
+/* FS operation request */
+struct fsif_request {
+ uint8_t type; /* Type of the request */
+ uint8_t pad;
+ uint16_t id; /* Request ID, copied to the response */
+ uint32_t pad2;
+ union {
+ struct fsif_open_request fopen;
+ struct fsif_close_request fclose;
+ struct fsif_read_request fread;
+ struct fsif_write_request fwrite;
+ struct fsif_stat_request fstat;
+ struct fsif_truncate_request ftruncate;
+ struct fsif_remove_request fremove;
+ struct fsif_rename_request frename;
+ struct fsif_create_request fcreate;
+ struct fsif_list_request flist;
+ struct fsif_chmod_request fchmod;
+ struct fsif_space_request fspace;
+ struct fsif_sync_request fsync;
+ } u;
+};
+typedef struct fsif_request fsif_request_t;
+
+/* FS operation response */
+struct fsif_response {
+ uint16_t id;
+ uint16_t pad1;
+ uint32_t pad2;
+ uint64_t ret_val;
+};
+
+typedef struct fsif_response fsif_response_t;
+
+
+DEFINE_RING_TYPES(fsif, struct fsif_request, struct fsif_response);
+
+#define STATE_INITIALISED "init"
+#define STATE_READY "ready"
+
+
+
+#endif
*/
#define XENKBD_TYPE_POS 4
-struct xenkbd_motion {
- uint8_t type; /* XENKBD_TYPE_MOTION */
- int32_t rel_x; /* relative X motion */
- int32_t rel_y; /* relative Y motion */
+struct xenkbd_motion
+{
+ uint8_t type; /* XENKBD_TYPE_MOTION */
+ int32_t rel_x; /* relative X motion */
+ int32_t rel_y; /* relative Y motion */
+ int32_t rel_z; /* relative Z motion (wheel) */
};
-struct xenkbd_key {
- uint8_t type; /* XENKBD_TYPE_KEY */
- uint8_t pressed; /* 1 if pressed; 0 otherwise */
- uint32_t keycode; /* KEY_* from linux/input.h */
+struct xenkbd_key
+{
+ uint8_t type; /* XENKBD_TYPE_KEY */
+ uint8_t pressed; /* 1 if pressed; 0 otherwise */
+ uint32_t keycode; /* KEY_* from linux/input.h */
};
-struct xenkbd_position {
- uint8_t type; /* XENKBD_TYPE_POS */
- int32_t abs_x; /* absolute X position (in FB pixels) */
- int32_t abs_y; /* absolute Y position (in FB pixels) */
+struct xenkbd_position
+{
+ uint8_t type; /* XENKBD_TYPE_POS */
+ int32_t abs_x; /* absolute X position (in FB pixels) */
+ int32_t abs_y; /* absolute Y position (in FB pixels) */
+ int32_t rel_z; /* relative Z motion (wheel) */
};
#define XENKBD_IN_EVENT_SIZE 40
-union xenkbd_in_event {
- uint8_t type;
- struct xenkbd_motion motion;
- struct xenkbd_key key;
- struct xenkbd_position pos;
- char pad[XENKBD_IN_EVENT_SIZE];
+union xenkbd_in_event
+{
+ uint8_t type;
+ struct xenkbd_motion motion;
+ struct xenkbd_key key;
+ struct xenkbd_position pos;
+ char pad[XENKBD_IN_EVENT_SIZE];
};
/* Out events (frontend -> backend) */
#define XENKBD_OUT_EVENT_SIZE 40
-union xenkbd_out_event {
- uint8_t type;
- char pad[XENKBD_OUT_EVENT_SIZE];
+union xenkbd_out_event
+{
+ uint8_t type;
+ char pad[XENKBD_OUT_EVENT_SIZE];
};
/* shared page */
#define XENKBD_IN_RING_LEN (XENKBD_IN_RING_SIZE / XENKBD_IN_EVENT_SIZE)
#define XENKBD_IN_RING_OFFS 1024
#define XENKBD_IN_RING(page) \
- ((union xenkbd_in_event *)((char *)(page) + XENKBD_IN_RING_OFFS))
+ ((union xenkbd_in_event *)((char *)(page) + XENKBD_IN_RING_OFFS))
#define XENKBD_IN_RING_REF(page, idx) \
- (XENKBD_IN_RING((page))[(idx) % XENKBD_IN_RING_LEN])
+ (XENKBD_IN_RING((page))[(idx) % XENKBD_IN_RING_LEN])
#define XENKBD_OUT_RING_SIZE 1024
#define XENKBD_OUT_RING_LEN (XENKBD_OUT_RING_SIZE / XENKBD_OUT_EVENT_SIZE)
#define XENKBD_OUT_RING_OFFS (XENKBD_IN_RING_OFFS + XENKBD_IN_RING_SIZE)
#define XENKBD_OUT_RING(page) \
- ((union xenkbd_out_event *)((char *)(page) + XENKBD_OUT_RING_OFFS))
+ ((union xenkbd_out_event *)((char *)(page) + XENKBD_OUT_RING_OFFS))
#define XENKBD_OUT_RING_REF(page, idx) \
- (XENKBD_OUT_RING((page))[(idx) % XENKBD_OUT_RING_LEN])
+ (XENKBD_OUT_RING((page))[(idx) % XENKBD_OUT_RING_LEN])
-struct xenkbd_page {
- uint32_t in_cons, in_prod;
- uint32_t out_cons, out_prod;
+struct xenkbd_page
+{
+ uint32_t in_cons, in_prod;
+ uint32_t out_cons, out_prod;
};
#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
*
* Unified network-device I/O interface for Xen guest OSes.
*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
* Copyright (c) 2003-2004, Keir Fraser
*/
#define _NETTXF_extra_info (3)
#define NETTXF_extra_info (1U<<_NETTXF_extra_info)
-struct xen_netif_tx_request {
+struct netif_tx_request {
grant_ref_t gref; /* Reference to buffer page */
uint16_t offset; /* Offset within buffer page */
uint16_t flags; /* NETTXF_* */
uint16_t id; /* Echoed in response message. */
uint16_t size; /* Packet size in bytes. */
};
+typedef struct netif_tx_request netif_tx_request_t;
/* Types of netif_extra_info descriptors. */
-#define XEN_NETIF_EXTRA_TYPE_NONE (0) /* Never used - invalid */
-#define XEN_NETIF_EXTRA_TYPE_GSO (1) /* u.gso */
-#define XEN_NETIF_EXTRA_TYPE_MAX (2)
+#define XEN_NETIF_EXTRA_TYPE_NONE (0) /* Never used - invalid */
+#define XEN_NETIF_EXTRA_TYPE_GSO (1) /* u.gso */
+#define XEN_NETIF_EXTRA_TYPE_MCAST_ADD (2) /* u.mcast */
+#define XEN_NETIF_EXTRA_TYPE_MCAST_DEL (3) /* u.mcast */
+#define XEN_NETIF_EXTRA_TYPE_MAX (4)
/* netif_extra_info flags. */
#define _XEN_NETIF_EXTRA_FLAG_MORE (0)
* This structure needs to fit within both netif_tx_request and
* netif_rx_response for compatibility.
*/
-struct xen_netif_extra_info {
- uint8_t type; /* XEN_NETIF_EXTRA_TYPE_* */
- uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */
-
- union {
- struct {
- /*
- * Maximum payload size of each segment. For
- * example, for TCP this is just the path MSS.
- */
- uint16_t size;
-
- /*
- * GSO type. This determines the protocol of
- * the packet and any extra features required
- * to segment the packet properly.
- */
- uint8_t type; /* XEN_NETIF_GSO_TYPE_* */
-
- /* Future expansion. */
- uint8_t pad;
-
- /*
- * GSO features. This specifies any extra GSO
- * features required to process this packet,
- * such as ECN support for TCPv4.
- */
- uint16_t features; /* XEN_NETIF_GSO_FEAT_* */
- } gso;
-
- uint16_t pad[3];
- } u;
+struct netif_extra_info {
+ uint8_t type; /* XEN_NETIF_EXTRA_TYPE_* */
+ uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */
+
+ union {
+ /*
+ * XEN_NETIF_EXTRA_TYPE_GSO:
+ */
+ struct {
+ /*
+ * Maximum payload size of each segment. For example, for TCP this
+ * is just the path MSS.
+ */
+ uint16_t size;
+
+ /*
+ * GSO type. This determines the protocol of the packet and any
+ * extra features required to segment the packet properly.
+ */
+ uint8_t type; /* XEN_NETIF_GSO_TYPE_* */
+
+ /* Future expansion. */
+ uint8_t pad;
+
+ /*
+ * GSO features. This specifies any extra GSO features required
+ * to process this packet, such as ECN support for TCPv4.
+ */
+ uint16_t features; /* XEN_NETIF_GSO_FEAT_* */
+ } gso;
+
+ /*
+ * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}:
+ * Backend advertises availability via 'feature-multicast-control'
+ * xenbus node containing value '1'.
+ * Frontend requests this feature by advertising
+ * 'request-multicast-control' xenbus node containing value '1'.
+ * If multicast control is requested then multicast flooding is
+ * disabled and the frontend must explicitly register its interest
+ * in multicast groups using dummy transmit requests containing
+ * MCAST_{ADD,DEL} extra-info fragments.
+ */
+ struct {
+ uint8_t addr[6]; /* Address to add/remove. */
+ } mcast;
+
+ uint16_t pad[3];
+ } u;
};
+typedef struct netif_extra_info netif_extra_info_t;
-struct xen_netif_tx_response {
- uint16_t id;
- int16_t status; /* NETIF_RSP_* */
+struct netif_tx_response {
+ uint16_t id;
+ int16_t status; /* NETIF_RSP_* */
};
+typedef struct netif_tx_response netif_tx_response_t;
-struct xen_netif_rx_request {
- uint16_t id; /* Echoed in response message. */
- grant_ref_t gref; /* Reference to incoming granted frame */
+struct netif_rx_request {
+ uint16_t id; /* Echoed in response message. */
+ grant_ref_t gref; /* Reference to incoming granted frame */
};
+typedef struct netif_rx_request netif_rx_request_t;
/* Packet data has been validated against protocol checksum. */
#define _NETRXF_data_validated (0)
#define _NETRXF_extra_info (3)
#define NETRXF_extra_info (1U<<_NETRXF_extra_info)
-struct xen_netif_rx_response {
+struct netif_rx_response {
uint16_t id;
uint16_t offset; /* Offset in page of start of received packet */
uint16_t flags; /* NETRXF_* */
int16_t status; /* -ve: BLKIF_RSP_* ; +ve: Rx'ed pkt size. */
};
+typedef struct netif_rx_response netif_rx_response_t;
/*
* Generate netif ring structures and types.
*/
+#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+DEFINE_RING_TYPES(netif_tx, struct netif_tx_request, struct netif_tx_response);
+DEFINE_RING_TYPES(netif_rx, struct netif_rx_request, struct netif_rx_response);
+#else
+#define xen_netif_tx_request netif_tx_request
+#define xen_netif_rx_request netif_rx_request
+#define xen_netif_tx_response netif_tx_response
+#define xen_netif_rx_response netif_rx_response
DEFINE_RING_TYPES(xen_netif_tx,
struct xen_netif_tx_request,
struct xen_netif_tx_response);
DEFINE_RING_TYPES(xen_netif_rx,
struct xen_netif_rx_request,
struct xen_netif_rx_response);
+#define xen_netif_extra_info netif_extra_info
+#endif
#define NETIF_RSP_DROPPED -2
#define NETIF_RSP_ERROR -1
#define NETIF_RSP_NULL 1
#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * PCI Backend/Frontend Common Data Structures & Macros
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#ifndef __XEN_PCI_COMMON_H__
+#define __XEN_PCI_COMMON_H__
+
+/* Be sure to bump this number if you change this file */
+#define XEN_PCI_MAGIC "7"
+
+/* xen_pci_sharedinfo flags */
+#define _XEN_PCIF_active (0)
+#define XEN_PCIF_active (1<<_XEN_PCI_active)
+
+/* xen_pci_op commands */
+#define XEN_PCI_OP_conf_read (0)
+#define XEN_PCI_OP_conf_write (1)
+
+/* xen_pci_op error numbers */
+#define XEN_PCI_ERR_success (0)
+#define XEN_PCI_ERR_dev_not_found (-1)
+#define XEN_PCI_ERR_invalid_offset (-2)
+#define XEN_PCI_ERR_access_denied (-3)
+#define XEN_PCI_ERR_not_implemented (-4)
+/* XEN_PCI_ERR_op_failed - backend failed to complete the operation */
+#define XEN_PCI_ERR_op_failed (-5)
+
+struct xen_pci_op {
+ /* IN: what action to perform: XEN_PCI_OP_* */
+ uint32_t cmd;
+
+ /* OUT: will contain an error number (if any) from errno.h */
+ int32_t err;
+
+ /* IN: which device to touch */
+ uint32_t domain; /* PCI Domain/Segment */
+ uint32_t bus;
+ uint32_t devfn;
+
+ /* IN: which configuration registers to touch */
+ int32_t offset;
+ int32_t size;
+
+ /* IN/OUT: Contains the result after a READ or the value to WRITE */
+ uint32_t value;
+};
+
+struct xen_pci_sharedinfo {
+ /* flags - XEN_PCIF_* */
+ uint32_t flags;
+ struct xen_pci_op op;
+};
+
+#endif /* __XEN_PCI_COMMON_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+/******************************************************************************
+ * protocols.h
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
#ifndef __XEN_PROTOCOLS_H__
#define __XEN_PROTOCOLS_H__
*
* Shared producer-consumer ring macros.
*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
* Tim Deegan and Andrew Warfield November 2004.
*/
#ifndef __XEN_PUBLIC_IO_RING_H__
#define __XEN_PUBLIC_IO_RING_H__
+#include "../xen-compat.h"
+
+#if __XEN_INTERFACE_VERSION__ < 0x00030208
+#define xen_mb() mb()
+#define xen_rmb() rmb()
+#define xen_wmb() wmb()
+#endif
+
typedef unsigned int RING_IDX;
/* Round a 32-bit unsigned constant down to the nearest power of two. */
-#define __RD2(_x) (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1))
+#define __RD2(_x) (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1))
#define __RD4(_x) (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2 : __RD2(_x))
#define __RD8(_x) (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4 : __RD4(_x))
#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8 : __RD8(_x))
* power of two (so we can mask with (size-1) to loop around).
*/
#define __RING_SIZE(_s, _sz) \
- (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
+ (__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
/*
* Macros to make the correct C datatypes for a new kind of ring.
*
* To make a new ring datatype, you need to have two message structures,
- * let's say struct request, and struct response already defined.
+ * let's say request_t, and response_t already defined.
*
* In a header where you want the ring datatype declared, you then do:
*
- * DEFINE_RING_TYPES(mytag, struct request, struct response);
+ * DEFINE_RING_TYPES(mytag, request_t, response_t);
*
* These expand out to give you a set of types, as you can see below.
* The most important of these are:
*
- * struct mytag_sring - The shared ring.
- * struct mytag_front_ring - The 'front' half of the ring.
- * struct mytag_back_ring - The 'back' half of the ring.
+ * mytag_sring_t - The shared ring.
+ * mytag_front_ring_t - The 'front' half of the ring.
+ * mytag_back_ring_t - The 'back' half of the ring.
*
* To initialize a ring in your code you need to know the location and size
* of the shared memory area (PAGE_SIZE, for instance). To initialise
* the front half:
*
- * struct mytag_front_ring front_ring;
- * SHARED_RING_INIT((struct mytag_sring *)shared_page);
- * FRONT_RING_INIT(&front_ring, (struct mytag_sring *)shared_page,
- * PAGE_SIZE);
+ * mytag_front_ring_t front_ring;
+ * SHARED_RING_INIT((mytag_sring_t *)shared_page);
+ * FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
*
* Initializing the back follows similarly (note that only the front
* initializes the shared ring):
*
- * struct mytag_back_ring back_ring;
- * BACK_RING_INIT(&back_ring, (struct mytag_sring *)shared_page,
- * PAGE_SIZE);
+ * mytag_back_ring_t back_ring;
+ * BACK_RING_INIT(&back_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
*/
-#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \
- \
-/* Shared ring entry */ \
-union __name##_sring_entry { \
- __req_t req; \
- __rsp_t rsp; \
-}; \
- \
-/* Shared ring page */ \
-struct __name##_sring { \
- RING_IDX req_prod, req_event; \
- RING_IDX rsp_prod, rsp_event; \
- uint8_t pad[48]; \
- union __name##_sring_entry ring[1]; /* variable-length */ \
-}; \
- \
-/* "Front" end's private variables */ \
-struct __name##_front_ring { \
- RING_IDX req_prod_pvt; \
- RING_IDX rsp_cons; \
- unsigned int nr_ents; \
- struct __name##_sring *sring; \
-}; \
- \
-/* "Back" end's private variables */ \
-struct __name##_back_ring { \
- RING_IDX rsp_prod_pvt; \
- RING_IDX req_cons; \
- unsigned int nr_ents; \
- struct __name##_sring *sring; \
-};
+#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \
+ \
+/* Shared ring entry */ \
+union __name##_sring_entry { \
+ __req_t req; \
+ __rsp_t rsp; \
+}; \
+ \
+/* Shared ring page */ \
+struct __name##_sring { \
+ RING_IDX req_prod, req_event; \
+ RING_IDX rsp_prod, rsp_event; \
+ uint8_t pad[48]; \
+ union __name##_sring_entry ring[1]; /* variable-length */ \
+}; \
+ \
+/* "Front" end's private variables */ \
+struct __name##_front_ring { \
+ RING_IDX req_prod_pvt; \
+ RING_IDX rsp_cons; \
+ unsigned int nr_ents; \
+ struct __name##_sring *sring; \
+}; \
+ \
+/* "Back" end's private variables */ \
+struct __name##_back_ring { \
+ RING_IDX rsp_prod_pvt; \
+ RING_IDX req_cons; \
+ unsigned int nr_ents; \
+ struct __name##_sring *sring; \
+}; \
+ \
+/* Syntactic sugar */ \
+typedef struct __name##_sring __name##_sring_t; \
+typedef struct __name##_front_ring __name##_front_ring_t; \
+typedef struct __name##_back_ring __name##_back_ring_t
/*
* Macros for manipulating rings.
*/
/* Initialising empty rings */
-#define SHARED_RING_INIT(_s) do { \
- (_s)->req_prod = (_s)->rsp_prod = 0; \
- (_s)->req_event = (_s)->rsp_event = 1; \
- memset((_s)->pad, 0, sizeof((_s)->pad)); \
+#define SHARED_RING_INIT(_s) do { \
+ (_s)->req_prod = (_s)->rsp_prod = 0; \
+ (_s)->req_event = (_s)->rsp_event = 1; \
+ (void)memset((_s)->pad, 0, sizeof((_s)->pad)); \
} while(0)
-#define FRONT_RING_INIT(_r, _s, __size) do { \
- (_r)->req_prod_pvt = 0; \
- (_r)->rsp_cons = 0; \
- (_r)->nr_ents = __RING_SIZE(_s, __size); \
- (_r)->sring = (_s); \
+#define FRONT_RING_INIT(_r, _s, __size) do { \
+ (_r)->req_prod_pvt = 0; \
+ (_r)->rsp_cons = 0; \
+ (_r)->nr_ents = __RING_SIZE(_s, __size); \
+ (_r)->sring = (_s); \
} while (0)
-#define BACK_RING_INIT(_r, _s, __size) do { \
- (_r)->rsp_prod_pvt = 0; \
- (_r)->req_cons = 0; \
- (_r)->nr_ents = __RING_SIZE(_s, __size); \
- (_r)->sring = (_s); \
+#define BACK_RING_INIT(_r, _s, __size) do { \
+ (_r)->rsp_prod_pvt = 0; \
+ (_r)->req_cons = 0; \
+ (_r)->nr_ents = __RING_SIZE(_s, __size); \
+ (_r)->sring = (_s); \
} while (0)
/* Initialize to existing shared indexes -- for recovery */
-#define FRONT_RING_ATTACH(_r, _s, __size) do { \
- (_r)->sring = (_s); \
- (_r)->req_prod_pvt = (_s)->req_prod; \
- (_r)->rsp_cons = (_s)->rsp_prod; \
- (_r)->nr_ents = __RING_SIZE(_s, __size); \
+#define FRONT_RING_ATTACH(_r, _s, __size) do { \
+ (_r)->sring = (_s); \
+ (_r)->req_prod_pvt = (_s)->req_prod; \
+ (_r)->rsp_cons = (_s)->rsp_prod; \
+ (_r)->nr_ents = __RING_SIZE(_s, __size); \
} while (0)
-#define BACK_RING_ATTACH(_r, _s, __size) do { \
- (_r)->sring = (_s); \
- (_r)->rsp_prod_pvt = (_s)->rsp_prod; \
- (_r)->req_cons = (_s)->req_prod; \
- (_r)->nr_ents = __RING_SIZE(_s, __size); \
+#define BACK_RING_ATTACH(_r, _s, __size) do { \
+ (_r)->sring = (_s); \
+ (_r)->rsp_prod_pvt = (_s)->rsp_prod; \
+ (_r)->req_cons = (_s)->req_prod; \
+ (_r)->nr_ents = __RING_SIZE(_s, __size); \
} while (0)
/* How big is this ring? */
-#define RING_SIZE(_r) \
+#define RING_SIZE(_r) \
((_r)->nr_ents)
/* Number of free requests (for use on front side only). */
-#define RING_FREE_REQUESTS(_r) \
+#define RING_FREE_REQUESTS(_r) \
(RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons))
/* Test if there is an empty slot available on the front ring.
* (This is only meaningful from the front. )
*/
-#define RING_FULL(_r) \
+#define RING_FULL(_r) \
(RING_FREE_REQUESTS(_r) == 0)
/* Test if there are outstanding messages to be processed on a ring. */
-#define RING_HAS_UNCONSUMED_RESPONSES(_r) \
+#define RING_HAS_UNCONSUMED_RESPONSES(_r) \
((_r)->sring->rsp_prod - (_r)->rsp_cons)
-#define RING_HAS_UNCONSUMED_REQUESTS(_r) \
- ({ \
- unsigned int req = (_r)->sring->req_prod - (_r)->req_cons; \
- unsigned int rsp = RING_SIZE(_r) - \
- ((_r)->req_cons - (_r)->rsp_prod_pvt); \
- req < rsp ? req : rsp; \
- })
+#ifdef __GNUC__
+#define RING_HAS_UNCONSUMED_REQUESTS(_r) ({ \
+ unsigned int req = (_r)->sring->req_prod - (_r)->req_cons; \
+ unsigned int rsp = RING_SIZE(_r) - \
+ ((_r)->req_cons - (_r)->rsp_prod_pvt); \
+ req < rsp ? req : rsp; \
+})
+#else
+/* Same as above, but without the nice GCC ({ ... }) syntax. */
+#define RING_HAS_UNCONSUMED_REQUESTS(_r) \
+ ((((_r)->sring->req_prod - (_r)->req_cons) < \
+ (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt))) ? \
+ ((_r)->sring->req_prod - (_r)->req_cons) : \
+ (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt)))
+#endif
/* Direct access to individual ring elements, by index. */
-#define RING_GET_REQUEST(_r, _idx) \
+#define RING_GET_REQUEST(_r, _idx) \
(&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))
-#define RING_GET_RESPONSE(_r, _idx) \
+#define RING_GET_RESPONSE(_r, _idx) \
(&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))
/* Loop termination condition: Would the specified index overflow the ring? */
-#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \
+#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \
(((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
-#define RING_PUSH_REQUESTS(_r) do { \
- wmb(); /* back sees requests /before/ updated producer index */ \
- (_r)->sring->req_prod = (_r)->req_prod_pvt; \
+#define RING_PUSH_REQUESTS(_r) do { \
+ xen_wmb(); /* back sees requests /before/ updated producer index */ \
+ (_r)->sring->req_prod = (_r)->req_prod_pvt; \
} while (0)
-#define RING_PUSH_RESPONSES(_r) do { \
- wmb(); /* front sees responses /before/ updated producer index */ \
- (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \
+#define RING_PUSH_RESPONSES(_r) do { \
+ xen_wmb(); /* front sees resps /before/ updated producer index */ \
+ (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \
} while (0)
/*
* field appropriately.
*/
-#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \
- RING_IDX __old = (_r)->sring->req_prod; \
- RING_IDX __new = (_r)->req_prod_pvt; \
- wmb(); /* back sees requests /before/ updated producer index */ \
- (_r)->sring->req_prod = __new; \
- mb(); /* back sees new requests /before/ we check req_event */ \
- (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \
- (RING_IDX)(__new - __old)); \
+#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \
+ RING_IDX __old = (_r)->sring->req_prod; \
+ RING_IDX __new = (_r)->req_prod_pvt; \
+ xen_wmb(); /* back sees requests /before/ updated producer index */ \
+ (_r)->sring->req_prod = __new; \
+ xen_mb(); /* back sees new requests /before/ we check req_event */ \
+ (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \
+ (RING_IDX)(__new - __old)); \
} while (0)
-#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \
- RING_IDX __old = (_r)->sring->rsp_prod; \
- RING_IDX __new = (_r)->rsp_prod_pvt; \
- wmb(); /* front sees responses /before/ updated producer index */ \
- (_r)->sring->rsp_prod = __new; \
- mb(); /* front sees new responses /before/ we check rsp_event */ \
- (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \
- (RING_IDX)(__new - __old)); \
+#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \
+ RING_IDX __old = (_r)->sring->rsp_prod; \
+ RING_IDX __new = (_r)->rsp_prod_pvt; \
+ xen_wmb(); /* front sees resps /before/ updated producer index */ \
+ (_r)->sring->rsp_prod = __new; \
+ xen_mb(); /* front sees new resps /before/ we check rsp_event */ \
+ (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \
+ (RING_IDX)(__new - __old)); \
} while (0)
-#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \
- (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
- if (_work_to_do) break; \
- (_r)->sring->req_event = (_r)->req_cons + 1; \
- mb(); \
- (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
+#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \
+ (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
+ if (_work_to_do) break; \
+ (_r)->sring->req_event = (_r)->req_cons + 1; \
+ xen_mb(); \
+ (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
} while (0)
-#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \
- (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
- if (_work_to_do) break; \
- (_r)->sring->rsp_event = (_r)->rsp_cons + 1; \
- mb(); \
- (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
+#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \
+ (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
+ if (_work_to_do) break; \
+ (_r)->sring->rsp_event = (_r)->rsp_cons + 1; \
+ xen_mb(); \
+ (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
} while (0)
#endif /* __XEN_PUBLIC_IO_RING_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * tpmif.h
+ *
+ * TPM I/O interface for Xen guest OSes.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2005, IBM Corporation
+ *
+ * Author: Stefan Berger, stefanb@us.ibm.com
+ * Grant table support: Mahadevan Gomathisankaran
+ *
+ * This code has been derived from tools/libxc/xen/io/netif.h
+ *
+ * Copyright (c) 2003-2004, Keir Fraser
+ */
+
+#ifndef __XEN_PUBLIC_IO_TPMIF_H__
+#define __XEN_PUBLIC_IO_TPMIF_H__
+
+#include "../grant_table.h"
+
+struct tpmif_tx_request {
+ unsigned long addr; /* Machine address of packet. */
+ grant_ref_t ref; /* grant table access reference */
+ uint16_t unused;
+ uint16_t size; /* Packet size in bytes. */
+};
+typedef struct tpmif_tx_request tpmif_tx_request_t;
+
+/*
+ * The TPMIF_TX_RING_SIZE defines the number of pages the
+ * front-end and backend can exchange (= size of array).
+ */
+typedef uint32_t TPMIF_RING_IDX;
+
+#define TPMIF_TX_RING_SIZE 1
+
+/* This structure must fit in a memory page. */
+
+struct tpmif_ring {
+ struct tpmif_tx_request req;
+};
+typedef struct tpmif_ring tpmif_ring_t;
+
+struct tpmif_tx_interface {
+ struct tpmif_ring ring[TPMIF_TX_RING_SIZE];
+};
+typedef struct tpmif_tx_interface tpmif_tx_interface_t;
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
*
* Xenbus protocol details.
*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
* Copyright (C) 2005 XenSource Ltd.
*/
#ifndef _XEN_PUBLIC_IO_XENBUS_H
#define _XEN_PUBLIC_IO_XENBUS_H
-/* The state of either end of the Xenbus, i.e. the current communication
- status of initialisation across the bus. States here imply nothing about
- the state of the connection between the driver and the kernel's device
- layers. */
-enum xenbus_state
-{
- XenbusStateUnknown = 0,
- XenbusStateInitialising = 1,
- XenbusStateInitWait = 2, /* Finished early
- initialisation, but waiting
- for information from the peer
- or hotplug scripts. */
- XenbusStateInitialised = 3, /* Initialised and waiting for a
- connection from the peer. */
- XenbusStateConnected = 4,
- XenbusStateClosing = 5, /* The device is being closed
- due to an error or an unplug
- event. */
- XenbusStateClosed = 6
+/*
+ * The state of either end of the Xenbus, i.e. the current communication
+ * status of initialisation across the bus. States here imply nothing about
+ * the state of the connection between the driver and the kernel's device
+ * layers.
+ */
+enum xenbus_state {
+ XenbusStateUnknown = 0,
+
+ XenbusStateInitialising = 1,
+
+ /*
+ * InitWait: Finished early initialisation but waiting for information
+ * from the peer or hotplug scripts.
+ */
+ XenbusStateInitWait = 2,
+
+ /*
+ * Initialised: Waiting for a connection from the peer.
+ */
+ XenbusStateInitialised = 3,
+
+ XenbusStateConnected = 4,
+
+ /*
+ * Closing: The device is being closed due to an error or an unplug event.
+ */
+ XenbusStateClosing = 5,
+
+ XenbusStateClosed = 6,
+
+ /*
+ * Reconfiguring: The device is being reconfigured.
+ */
+ XenbusStateReconfiguring = 7,
+ XenbusStateReconfigured = 8
};
+typedef enum xenbus_state XenbusState;
#endif /* _XEN_PUBLIC_IO_XENBUS_H */
/*
* Local variables:
- * c-file-style: "linux"
- * indent-tabs-mode: t
- * c-indent-level: 8
- * c-basic-offset: 8
- * tab-width: 8
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
* End:
*/
/*
* Details of the "wire" protocol between Xen Store Daemon and client
* library or guest kernel.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
* Copyright (C) 2005 Rusty Russell IBM Corporation
*/
XS_SET_PERMS,
XS_WATCH_EVENT,
XS_ERROR,
- XS_IS_DOMAIN_INTRODUCED
+ XS_IS_DOMAIN_INTRODUCED,
+ XS_RESUME,
+ XS_SET_TARGET
};
#define XS_WRITE_NONE "NONE"
const char *errstring;
};
#define XSD_ERROR(x) { x, #x }
-static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
+static struct xsd_errors xsd_errors[]
+#if defined(__GNUC__)
+__attribute__((unused))
+#endif
+ = {
XSD_ERROR(EINVAL),
XSD_ERROR(EACCES),
XSD_ERROR(EEXIST),
XENSTORE_RING_IDX rsp_cons, rsp_prod;
};
+/* Violating this is very bad. See docs/misc/xenstore.txt. */
+#define XENSTORE_PAYLOAD_MAX 4096
+
+/* Violating these just gets you an error back */
+#define XENSTORE_ABS_PATH_MAX 3072
+#define XENSTORE_REL_PATH_MAX 2048
+
#endif /* _XS_WIRE_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * kexec.h - Public portion
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Xen port written by:
+ * - Simon 'Horms' Horman <horms@verge.net.au>
+ * - Magnus Damm <magnus@valinux.co.jp>
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+
+/* This file describes the Kexec / Kdump hypercall interface for Xen.
+ *
+ * Kexec under vanilla Linux allows a user to reboot the physical machine
+ * into a new user-specified kernel. The Xen port extends this idea
+ * to allow rebooting of the machine from dom0. When kexec for dom0
+ * is used to reboot, both the hypervisor and the domains get replaced
+ * with some other kernel. It is possible to kexec between vanilla
+ * Linux and Xen and back again. Xen to Xen works well too.
+ *
+ * The hypercall interface for kexec can be divided into three main
+ * types of hypercall operations:
+ *
+ * 1) Range information:
+ * This is used by the dom0 kernel to ask the hypervisor about various
+ * address information. This information is needed to allow kexec-tools
+ * to fill in the ELF headers for /proc/vmcore properly.
+ *
+ * 2) Load and unload of images:
+ * There are no big surprises here, the kexec binary from kexec-tools
+ * runs in userspace in dom0. The tool loads/unloads data into the
+ * dom0 kernel such as new kernel, initramfs and hypervisor. When
+ * loaded the dom0 kernel performs a load hypercall operation, and
+ * before releasing all page references the dom0 kernel calls unload.
+ *
+ * 3) Kexec operation:
+ * This is used to start a previously loaded kernel.
+ */
+
+#include "xen.h"
+
+#if defined(__i386__) || defined(__x86_64__)
+#define KEXEC_XEN_NO_PAGES 17
+#endif
+
+/*
+ * Prototype for this hypercall is:
+ * int kexec_op(int cmd, void *args)
+ * @cmd == KEXEC_CMD_...
+ * KEXEC operation to perform
+ * @args == Operation-specific extra arguments (NULL if none).
+ */
+
+/*
+ * Kexec supports two types of operation:
+ * - kexec into a regular kernel, very similar to a standard reboot
+ * - KEXEC_TYPE_DEFAULT is used to specify this type
+ * - kexec into a special "crash kernel", aka kexec-on-panic
+ * - KEXEC_TYPE_CRASH is used to specify this type
+ * - parts of our system may be broken at kexec-on-panic time
+ * - the code should be kept as simple and self-contained as possible
+ */
+
+#define KEXEC_TYPE_DEFAULT 0
+#define KEXEC_TYPE_CRASH 1
+
+
+/* The kexec implementation for Xen allows the user to load two
+ * types of kernels, KEXEC_TYPE_DEFAULT and KEXEC_TYPE_CRASH.
+ * All data needed for a kexec reboot is kept in one xen_kexec_image_t
+ * per "instance". The data mainly consists of machine address lists to pages
+ * together with destination addresses. The data in xen_kexec_image_t
+ * is passed to the "code page" which is one page of code that performs
+ * the final relocations before jumping to the new kernel.
+ */
+
+typedef struct xen_kexec_image {
+#if defined(__i386__) || defined(__x86_64__)
+ unsigned long page_list[KEXEC_XEN_NO_PAGES];
+#endif
+#if defined(__ia64__)
+ unsigned long reboot_code_buffer;
+#endif
+ unsigned long indirection_page;
+ unsigned long start_address;
+} xen_kexec_image_t;
+
+/*
+ * Perform kexec having previously loaded a kexec or kdump kernel
+ * as appropriate.
+ * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
+ */
+#define KEXEC_CMD_kexec 0
+typedef struct xen_kexec_exec {
+ int type;
+} xen_kexec_exec_t;
+
+/*
+ * Load/Unload kernel image for kexec or kdump.
+ * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
+ * image == relocation information for kexec (ignored for unload) [in]
+ */
+#define KEXEC_CMD_kexec_load 1
+#define KEXEC_CMD_kexec_unload 2
+typedef struct xen_kexec_load {
+ int type;
+ xen_kexec_image_t image;
+} xen_kexec_load_t;
+
+#define KEXEC_RANGE_MA_CRASH 0 /* machine address and size of crash area */
+#define KEXEC_RANGE_MA_XEN 1 /* machine address and size of Xen itself */
+#define KEXEC_RANGE_MA_CPU 2 /* machine address and size of a CPU note */
+#define KEXEC_RANGE_MA_XENHEAP 3 /* machine address and size of xenheap
+ * Note that although this is adjacent
+ * to Xen it exists in a separate EFI
+ * region on ia64, and thus needs to be
+ * inserted into iomem_machine separately */
+#define KEXEC_RANGE_MA_BOOT_PARAM 4 /* machine address and size of
+ * the ia64_boot_param */
+#define KEXEC_RANGE_MA_EFI_MEMMAP 5 /* machine address and size of
+ * of the EFI Memory Map */
+
+/*
+ * Find the address and size of certain memory areas
+ * range == KEXEC_RANGE_... [in]
+ * nr == physical CPU number (starting from 0) if KEXEC_RANGE_MA_CPU [in]
+ * size == number of bytes reserved in window [out]
+ * start == address of the first byte in the window [out]
+ */
+#define KEXEC_CMD_kexec_get_range 3
+typedef struct xen_kexec_range {
+ int range;
+ int nr;
+ unsigned long size;
+ unsigned long start;
+} xen_kexec_range_t;
+
+#endif /* _XEN_PUBLIC_KEXEC_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * libelf.h
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XC_LIBELF__
+#define __XC_LIBELF__ 1
+
+#if defined(__i386__) || defined(__x86_64) || defined(__ia64__)
+#define XEN_ELF_LITTLE_ENDIAN
+#elif defined(__powerpc__)
+#define XEN_ELF_BIG_ENDIAN
+#else
+#error define architectural endianness
+#endif
+
+#undef ELFSIZE
+#include "elfnote.h"
+#include "elfstructs.h"
+#include "features.h"
+
+/* ------------------------------------------------------------------------ */
+
+typedef union {
+ Elf32_Ehdr e32;
+ Elf64_Ehdr e64;
+} elf_ehdr;
+
+typedef union {
+ Elf32_Phdr e32;
+ Elf64_Phdr e64;
+} elf_phdr;
+
+typedef union {
+ Elf32_Shdr e32;
+ Elf64_Shdr e64;
+} elf_shdr;
+
+typedef union {
+ Elf32_Sym e32;
+ Elf64_Sym e64;
+} elf_sym;
+
+typedef union {
+ Elf32_Rel e32;
+ Elf64_Rel e64;
+} elf_rel;
+
+typedef union {
+ Elf32_Rela e32;
+ Elf64_Rela e64;
+} elf_rela;
+
+typedef union {
+ Elf32_Note e32;
+ Elf64_Note e64;
+} elf_note;
+
+struct elf_binary {
+ /* elf binary */
+ const char *image;
+ size_t size;
+ char class;
+ char data;
+
+ const elf_ehdr *ehdr;
+ const char *sec_strtab;
+ const elf_shdr *sym_tab;
+ const char *sym_strtab;
+
+ /* loaded to */
+ char *dest;
+ uint64_t pstart;
+ uint64_t pend;
+ uint64_t reloc_offset;
+
+ uint64_t bsd_symtab_pstart;
+ uint64_t bsd_symtab_pend;
+
+#ifndef __XEN__
+ /* misc */
+ FILE *log;
+#endif
+ int verbose;
+};
+
+/* ------------------------------------------------------------------------ */
+/* accessing elf header fields */
+
+#ifdef XEN_ELF_BIG_ENDIAN
+# define NATIVE_ELFDATA ELFDATA2MSB
+#else
+# define NATIVE_ELFDATA ELFDATA2LSB
+#endif
+
+#define elf_32bit(elf) (ELFCLASS32 == (elf)->class)
+#define elf_64bit(elf) (ELFCLASS64 == (elf)->class)
+#define elf_msb(elf) (ELFDATA2MSB == (elf)->data)
+#define elf_lsb(elf) (ELFDATA2LSB == (elf)->data)
+#define elf_swap(elf) (NATIVE_ELFDATA != (elf)->data)
+
+#define elf_uval(elf, str, elem) \
+ ((ELFCLASS64 == (elf)->class) \
+ ? elf_access_unsigned((elf), (str), \
+ offsetof(typeof(*(str)),e64.elem), \
+ sizeof((str)->e64.elem)) \
+ : elf_access_unsigned((elf), (str), \
+ offsetof(typeof(*(str)),e32.elem), \
+ sizeof((str)->e32.elem)))
+
+#define elf_sval(elf, str, elem) \
+ ((ELFCLASS64 == (elf)->class) \
+ ? elf_access_signed((elf), (str), \
+ offsetof(typeof(*(str)),e64.elem), \
+ sizeof((str)->e64.elem)) \
+ : elf_access_signed((elf), (str), \
+ offsetof(typeof(*(str)),e32.elem), \
+ sizeof((str)->e32.elem)))
+
+#define elf_size(elf, str) \
+ ((ELFCLASS64 == (elf)->class) \
+ ? sizeof((str)->e64) : sizeof((str)->e32))
+
+uint64_t elf_access_unsigned(struct elf_binary *elf, const void *ptr,
+ uint64_t offset, size_t size);
+int64_t elf_access_signed(struct elf_binary *elf, const void *ptr,
+ uint64_t offset, size_t size);
+
+uint64_t elf_round_up(struct elf_binary *elf, uint64_t addr);
+
+/* ------------------------------------------------------------------------ */
+/* xc_libelf_tools.c */
+
+int elf_shdr_count(struct elf_binary *elf);
+int elf_phdr_count(struct elf_binary *elf);
+
+const elf_shdr *elf_shdr_by_name(struct elf_binary *elf, const char *name);
+const elf_shdr *elf_shdr_by_index(struct elf_binary *elf, int index);
+const elf_phdr *elf_phdr_by_index(struct elf_binary *elf, int index);
+
+const char *elf_section_name(struct elf_binary *elf, const elf_shdr * shdr);
+const void *elf_section_start(struct elf_binary *elf, const elf_shdr * shdr);
+const void *elf_section_end(struct elf_binary *elf, const elf_shdr * shdr);
+
+const void *elf_segment_start(struct elf_binary *elf, const elf_phdr * phdr);
+const void *elf_segment_end(struct elf_binary *elf, const elf_phdr * phdr);
+
+const elf_sym *elf_sym_by_name(struct elf_binary *elf, const char *symbol);
+const elf_sym *elf_sym_by_index(struct elf_binary *elf, int index);
+
+const char *elf_note_name(struct elf_binary *elf, const elf_note * note);
+const void *elf_note_desc(struct elf_binary *elf, const elf_note * note);
+uint64_t elf_note_numeric(struct elf_binary *elf, const elf_note * note);
+const elf_note *elf_note_next(struct elf_binary *elf, const elf_note * note);
+
+int elf_is_elfbinary(const void *image);
+int elf_phdr_is_loadable(struct elf_binary *elf, const elf_phdr * phdr);
+
+/* ------------------------------------------------------------------------ */
+/* xc_libelf_loader.c */
+
+int elf_init(struct elf_binary *elf, const char *image, size_t size);
+#ifdef __XEN__
+void elf_set_verbose(struct elf_binary *elf);
+#else
+void elf_set_logfile(struct elf_binary *elf, FILE * log, int verbose);
+#endif
+
+void elf_parse_binary(struct elf_binary *elf);
+void elf_load_binary(struct elf_binary *elf);
+
+void *elf_get_ptr(struct elf_binary *elf, unsigned long addr);
+uint64_t elf_lookup_addr(struct elf_binary *elf, const char *symbol);
+
+void elf_parse_bsdsyms(struct elf_binary *elf, uint64_t pstart); /* private */
+
+/* ------------------------------------------------------------------------ */
+/* xc_libelf_relocate.c */
+
+int elf_reloc(struct elf_binary *elf);
+
+/* ------------------------------------------------------------------------ */
+/* xc_libelf_dominfo.c */
+
+#define UNSET_ADDR ((uint64_t)-1)
+
+enum xen_elfnote_type {
+ XEN_ENT_NONE = 0,
+ XEN_ENT_LONG = 1,
+ XEN_ENT_STR = 2
+};
+
+struct xen_elfnote {
+ enum xen_elfnote_type type;
+ const char *name;
+ union {
+ const char *str;
+ uint64_t num;
+ } data;
+};
+
+struct elf_dom_parms {
+ /* raw */
+ const char *guest_info;
+ const void *elf_note_start;
+ const void *elf_note_end;
+ struct xen_elfnote elf_notes[XEN_ELFNOTE_MAX + 1];
+
+ /* parsed */
+ char guest_os[16];
+ char guest_ver[16];
+ char xen_ver[16];
+ char loader[16];
+ int pae;
+ int bsd_symtab;
+ uint64_t virt_base;
+ uint64_t virt_entry;
+ uint64_t virt_hypercall;
+ uint64_t virt_hv_start_low;
+ uint64_t elf_paddr_offset;
+ uint32_t f_supported[XENFEAT_NR_SUBMAPS];
+ uint32_t f_required[XENFEAT_NR_SUBMAPS];
+
+ /* calculated */
+ uint64_t virt_offset;
+ uint64_t virt_kstart;
+ uint64_t virt_kend;
+};
+
+static inline void elf_xen_feature_set(int nr, uint32_t * addr)
+{
+ addr[nr >> 5] |= 1 << (nr & 31);
+}
+static inline int elf_xen_feature_get(int nr, uint32_t * addr)
+{
+ return !!(addr[nr >> 5] & (1 << (nr & 31)));
+}
+
+int elf_xen_parse_features(const char *features,
+ uint32_t *supported,
+ uint32_t *required);
+int elf_xen_parse_note(struct elf_binary *elf,
+ struct elf_dom_parms *parms,
+ const elf_note *note);
+int elf_xen_parse_guest_info(struct elf_binary *elf,
+ struct elf_dom_parms *parms);
+int elf_xen_parse(struct elf_binary *elf,
+ struct elf_dom_parms *parms);
+
+#endif /* __XC_LIBELF__ */
*
* Memory reservation and information.
*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
* Copyright (c) 2005, Keir Fraser <keir@xensource.com>
*/
#define __XEN_PUBLIC_MEMORY_H__
/*
- * Increase or decrease the specified domain's memory reservation. Returns a
- * -ve errcode on failure, or the # extents successfully allocated or freed.
+ * Increase or decrease the specified domain's memory reservation. Returns the
+ * number of extents successfully allocated or freed.
* arg == addr of struct xen_memory_reservation.
*/
#define XENMEM_increase_reservation 0
* OUT: GMFN bases of extents that were allocated
* (NB. This command also updates the mach_to_phys translation table)
*/
+#ifndef CONFIG_PARAVIRT_XEN
+ XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
+#else
ulong extent_start;
+#endif
/* Number of extents, and size/alignment of each (2^extent_order pages). */
- unsigned long nr_extents;
+ xen_ulong_t nr_extents;
unsigned int extent_order;
/*
* Unprivileged domains can specify only DOMID_SELF.
*/
domid_t domid;
+};
+typedef struct xen_memory_reservation xen_memory_reservation_t;
+DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
+/*
+ * An atomic exchange of memory pages. If return code is zero then
+ * @out.extent_list provides GMFNs of the newly-allocated memory.
+ * Returns zero on complete success, otherwise a negative error code.
+ * On complete success then always @nr_exchanged == @in.nr_extents.
+ * On partial success @nr_exchanged indicates how much work was done.
+ */
+#define XENMEM_exchange 11
+struct xen_memory_exchange {
+ /*
+ * [IN] Details of memory extents to be exchanged (GMFN bases).
+ * Note that @in.address_bits is ignored and unused.
+ */
+ struct xen_memory_reservation in;
+
+ /*
+ * [IN/OUT] Details of new memory extents.
+ * We require that:
+ * 1. @in.domid == @out.domid
+ * 2. @in.nr_extents << @in.extent_order ==
+ * @out.nr_extents << @out.extent_order
+ * 3. @in.extent_start and @out.extent_start lists must not overlap
+ * 4. @out.extent_start lists GPFN bases to be populated
+ * 5. @out.extent_start is overwritten with allocated GMFN bases
+ */
+ struct xen_memory_reservation out;
+
+ /*
+ * [OUT] Number of input extents that were successfully exchanged:
+ * 1. The first @nr_exchanged input extents were successfully
+ * deallocated.
+ * 2. The corresponding first entries in the output extent list correctly
+ * indicate the GMFNs that were successfully exchanged.
+ * 3. All other input and output extents are untouched.
+ * 4. If not all input exents are exchanged then the return code of this
+ * command will be non-zero.
+ * 5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER!
+ */
+ xen_ulong_t nr_exchanged;
};
+typedef struct xen_memory_exchange xen_memory_exchange_t;
+DEFINE_XEN_GUEST_HANDLE(xen_memory_exchange_t);
/*
* Returns the maximum machine frame number of mapped RAM in this system.
#define XENMEM_maximum_reservation 4
/*
+ * Returns the maximum GPFN in use by the guest, or -ve errcode on failure.
+ */
+#define XENMEM_maximum_gpfn 14
+
+/*
* Returns a list of MFN bases of 2MB extents comprising the machine_to_phys
* mapping table. Architectures which do not have a m2p table do not implement
* this command.
* any large discontiguities in the machine address space, 2MB gaps in
* the machphys table will be represented by an MFN base of zero.
*/
+#ifndef CONFIG_PARAVIRT_XEN
+ XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
+#else
ulong extent_start;
+#endif
/*
* Number of extents written to the above array. This will be smaller
*/
unsigned int nr_extents;
};
+typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
+DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
+
+/*
+ * Returns the location in virtual address space of the machine_to_phys
+ * mapping table. Architectures which do not have a m2p table, or which do not
+ * map it by default into guest address space, do not implement this command.
+ * arg == addr of xen_machphys_mapping_t.
+ */
+#define XENMEM_machphys_mapping 12
+struct xen_machphys_mapping {
+ xen_ulong_t v_start, v_end; /* Start and end virtual addresses. */
+ xen_ulong_t max_mfn; /* Maximum MFN that can be looked up. */
+};
+typedef struct xen_machphys_mapping xen_machphys_mapping_t;
+DEFINE_XEN_GUEST_HANDLE(xen_machphys_mapping_t);
/*
* Sets the GPFN at which a particular page appears in the specified guest's
unsigned int space;
/* Index into source mapping space. */
- unsigned long idx;
+ xen_ulong_t idx;
/* GPFN where the source mapping page should appear. */
- unsigned long gpfn;
+ xen_pfn_t gpfn;
};
+typedef struct xen_add_to_physmap xen_add_to_physmap_t;
+DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
/*
* Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
domid_t domid;
/* Length of list. */
- unsigned long nr_gpfns;
+ xen_ulong_t nr_gpfns;
/* List of GPFNs to translate. */
+#ifndef CONFIG_PARAVIRT_XEN
+ XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
+#else
ulong gpfn_list;
+#endif
/*
* Output list to contain MFN translations. May be the same as the input
* list (in which case each input GPFN is overwritten with the output MFN).
*/
+#ifndef CONFIG_PARAVIRT_XEN
+ XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
+#else
ulong mfn_list;
+#endif
};
+DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
+typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
+DEFINE_XEN_GUEST_HANDLE(xen_translate_gpfn_list_t);
+
+/*
+ * Returns the pseudo-physical memory map as it was when the domain
+ * was started (specified by XENMEM_set_memory_map).
+ * arg == addr of xen_memory_map_t.
+ */
+#define XENMEM_memory_map 9
+struct xen_memory_map {
+ /*
+ * On call the number of entries which can be stored in buffer. On
+ * return the number of entries which have been stored in
+ * buffer.
+ */
+ unsigned int nr_entries;
+
+ /*
+ * Entries in the buffer are in the same format as returned by the
+ * BIOS INT 0x15 EAX=0xE820 call.
+ */
+ XEN_GUEST_HANDLE(void) buffer;
+};
+typedef struct xen_memory_map xen_memory_map_t;
+DEFINE_XEN_GUEST_HANDLE(xen_memory_map_t);
+
+/*
+ * Returns the real physical memory map. Passes the same structure as
+ * XENMEM_memory_map.
+ * arg == addr of xen_memory_map_t.
+ */
+#define XENMEM_machine_memory_map 10
+
+/*
+ * Set the pseudo-physical memory map of a domain, as returned by
+ * XENMEM_memory_map.
+ * arg == addr of xen_foreign_memory_map_t.
+ */
+#define XENMEM_set_memory_map 13
+struct xen_foreign_memory_map {
+ domid_t domid;
+ struct xen_memory_map map;
+};
+typedef struct xen_foreign_memory_map xen_foreign_memory_map_t;
+DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t);
#endif /* __XEN_PUBLIC_MEMORY_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * nmi.h
+ *
+ * NMI callback registration and reason codes.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
+ */
+
+#ifndef __XEN_PUBLIC_NMI_H__
+#define __XEN_PUBLIC_NMI_H__
+
+/*
+ * NMI reason codes:
+ * Currently these are x86-specific, stored in arch_shared_info.nmi_reason.
+ */
+ /* I/O-check error reported via ISA port 0x61, bit 6. */
+#define _XEN_NMIREASON_io_error 0
+#define XEN_NMIREASON_io_error (1UL << _XEN_NMIREASON_io_error)
+ /* Parity error reported via ISA port 0x61, bit 7. */
+#define _XEN_NMIREASON_parity_error 1
+#define XEN_NMIREASON_parity_error (1UL << _XEN_NMIREASON_parity_error)
+ /* Unknown hardware-generated NMI. */
+#define _XEN_NMIREASON_unknown 2
+#define XEN_NMIREASON_unknown (1UL << _XEN_NMIREASON_unknown)
+
+/*
+ * long nmi_op(unsigned int cmd, void *arg)
+ * NB. All ops return zero on success, else a negative error code.
+ */
+
+/*
+ * Register NMI callback for this (calling) VCPU. Currently this only makes
+ * sense for domain 0, vcpu 0. All other callers will be returned EINVAL.
+ * arg == pointer to xennmi_callback structure.
+ */
+#define XENNMI_register_callback 0
+struct xennmi_callback {
+ unsigned long handler_address;
+ unsigned long pad;
+};
+typedef struct xennmi_callback xennmi_callback_t;
+DEFINE_XEN_GUEST_HANDLE(xennmi_callback_t);
+
+/*
+ * Deregister NMI callback for this (calling) VCPU.
+ * arg == NULL.
+ */
+#define XENNMI_unregister_callback 1
+
+#endif /* __XEN_PUBLIC_NMI_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
/*
* Prototype for this hypercall is:
* int physdev_op(int cmd, void *args)
- * @cmd == PHYSDEVOP_??? (physdev operation).
+ * @cmd == PHYSDEVOP_??? (physdev operation).
* @args == Operation-specific extra arguments (NULL if none).
*/
* Notify end-of-interrupt (EOI) for the specified IRQ.
* @arg == pointer to physdev_eoi structure.
*/
-#define PHYSDEVOP_eoi 12
+#define PHYSDEVOP_eoi 12
struct physdev_eoi {
- /* IN */
- uint32_t irq;
+ /* IN */
+ uint32_t irq;
};
+typedef struct physdev_eoi physdev_eoi_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_eoi_t);
/*
* Query the status of an IRQ line.
* @arg == pointer to physdev_irq_status_query structure.
*/
-#define PHYSDEVOP_irq_status_query 5
+#define PHYSDEVOP_irq_status_query 5
struct physdev_irq_status_query {
- /* IN */
- uint32_t irq;
- /* OUT */
- uint32_t flags; /* XENIRQSTAT_* */
+ /* IN */
+ uint32_t irq;
+ /* OUT */
+ uint32_t flags; /* XENIRQSTAT_* */
};
+typedef struct physdev_irq_status_query physdev_irq_status_query_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_irq_status_query_t);
/* Need to call PHYSDEVOP_eoi when the IRQ has been serviced? */
-#define _XENIRQSTAT_needs_eoi (0)
-#define XENIRQSTAT_needs_eoi (1U<<_XENIRQSTAT_needs_eoi)
+#define _XENIRQSTAT_needs_eoi (0)
+#define XENIRQSTAT_needs_eoi (1U<<_XENIRQSTAT_needs_eoi)
/* IRQ shared by multiple guests? */
-#define _XENIRQSTAT_shared (1)
-#define XENIRQSTAT_shared (1U<<_XENIRQSTAT_shared)
+#define _XENIRQSTAT_shared (1)
+#define XENIRQSTAT_shared (1U<<_XENIRQSTAT_shared)
/*
* Set the current VCPU's I/O privilege level.
* @arg == pointer to physdev_set_iopl structure.
*/
-#define PHYSDEVOP_set_iopl 6
+#define PHYSDEVOP_set_iopl 6
struct physdev_set_iopl {
- /* IN */
- uint32_t iopl;
+ /* IN */
+ uint32_t iopl;
};
+typedef struct physdev_set_iopl physdev_set_iopl_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_set_iopl_t);
/*
* Set the current VCPU's I/O-port permissions bitmap.
* @arg == pointer to physdev_set_iobitmap structure.
*/
-#define PHYSDEVOP_set_iobitmap 7
+#define PHYSDEVOP_set_iobitmap 7
struct physdev_set_iobitmap {
- /* IN */
- uint8_t * bitmap;
- uint32_t nr_ports;
+ /* IN */
+#if __XEN_INTERFACE_VERSION__ >= 0x00030205
+ XEN_GUEST_HANDLE(uint8) bitmap;
+#else
+ uint8_t *bitmap;
+#endif
+ uint32_t nr_ports;
};
+typedef struct physdev_set_iobitmap physdev_set_iobitmap_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_set_iobitmap_t);
/*
* Read or write an IO-APIC register.
* @arg == pointer to physdev_apic structure.
*/
-#define PHYSDEVOP_apic_read 8
-#define PHYSDEVOP_apic_write 9
+#define PHYSDEVOP_apic_read 8
+#define PHYSDEVOP_apic_write 9
struct physdev_apic {
- /* IN */
- unsigned long apic_physbase;
- uint32_t reg;
- /* IN or OUT */
- uint32_t value;
+ /* IN */
+ unsigned long apic_physbase;
+ uint32_t reg;
+ /* IN or OUT */
+ uint32_t value;
};
+typedef struct physdev_apic physdev_apic_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_apic_t);
/*
* Allocate or free a physical upcall vector for the specified IRQ line.
* @arg == pointer to physdev_irq structure.
*/
-#define PHYSDEVOP_alloc_irq_vector 10
-#define PHYSDEVOP_free_irq_vector 11
+#define PHYSDEVOP_alloc_irq_vector 10
+#define PHYSDEVOP_free_irq_vector 11
struct physdev_irq {
- /* IN */
- uint32_t irq;
- /* IN or OUT */
- uint32_t vector;
+ /* IN */
+ uint32_t irq;
+ /* IN or OUT */
+ uint32_t vector;
};
+typedef struct physdev_irq physdev_irq_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_irq_t);
/*
* Argument to physdev_op_compat() hypercall. Superceded by new physdev_op()
* hypercall since 0x00030202.
*/
struct physdev_op {
- uint32_t cmd;
- union {
- struct physdev_irq_status_query irq_status_query;
- struct physdev_set_iopl set_iopl;
- struct physdev_set_iobitmap set_iobitmap;
- struct physdev_apic apic_op;
- struct physdev_irq irq_op;
- } u;
+ uint32_t cmd;
+ union {
+ struct physdev_irq_status_query irq_status_query;
+ struct physdev_set_iopl set_iopl;
+ struct physdev_set_iobitmap set_iobitmap;
+ struct physdev_apic apic_op;
+ struct physdev_irq irq_op;
+ } u;
};
+typedef struct physdev_op physdev_op_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_op_t);
/*
* Notify that some PIRQ-bound event channels have been unmasked.
* ** This command is obsolete since interface version 0x00030202 and is **
- * ** unsupported by newer versions of Xen. **
+ * ** unsupported by newer versions of Xen. **
*/
-#define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4
+#define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4
/*
* These all-capitals physdev operation names are superceded by the new names
* (defined above) since interface version 0x00030202.
*/
-#define PHYSDEVOP_IRQ_STATUS_QUERY PHYSDEVOP_irq_status_query
-#define PHYSDEVOP_SET_IOPL PHYSDEVOP_set_iopl
-#define PHYSDEVOP_SET_IOBITMAP PHYSDEVOP_set_iobitmap
-#define PHYSDEVOP_APIC_READ PHYSDEVOP_apic_read
-#define PHYSDEVOP_APIC_WRITE PHYSDEVOP_apic_write
-#define PHYSDEVOP_ASSIGN_VECTOR PHYSDEVOP_alloc_irq_vector
-#define PHYSDEVOP_FREE_VECTOR PHYSDEVOP_free_irq_vector
+#define PHYSDEVOP_IRQ_STATUS_QUERY PHYSDEVOP_irq_status_query
+#define PHYSDEVOP_SET_IOPL PHYSDEVOP_set_iopl
+#define PHYSDEVOP_SET_IOBITMAP PHYSDEVOP_set_iobitmap
+#define PHYSDEVOP_APIC_READ PHYSDEVOP_apic_read
+#define PHYSDEVOP_APIC_WRITE PHYSDEVOP_apic_write
+#define PHYSDEVOP_ASSIGN_VECTOR PHYSDEVOP_alloc_irq_vector
+#define PHYSDEVOP_FREE_VECTOR PHYSDEVOP_free_irq_vector
#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi
-#define PHYSDEVOP_IRQ_SHARED XENIRQSTAT_shared
+#define PHYSDEVOP_IRQ_SHARED XENIRQSTAT_shared
#endif /* __XEN_PUBLIC_PHYSDEV_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * platform.h
+ *
+ * Hardware platform operations. Intended for use by domain-0 kernel.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2002-2006, K Fraser
+ */
+
+#ifndef __XEN_PUBLIC_PLATFORM_H__
+#define __XEN_PUBLIC_PLATFORM_H__
+
+#include "xen.h"
+
+#define XENPF_INTERFACE_VERSION 0x03000001
+
+/*
+ * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC,
+ * 1 January, 1970 if the current system time was <system_time>.
+ */
+#define XENPF_settime 17
+struct xenpf_settime {
+ /* IN variables. */
+ uint32_t secs;
+ uint32_t nsecs;
+ uint64_t system_time;
+};
+typedef struct xenpf_settime xenpf_settime_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_settime_t);
+
+/*
+ * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type.
+ * On x86, @type is an architecture-defined MTRR memory type.
+ * On success, returns the MTRR that was used (@reg) and a handle that can
+ * be passed to XENPF_DEL_MEMTYPE to accurately tear down the new setting.
+ * (x86-specific).
+ */
+#define XENPF_add_memtype 31
+struct xenpf_add_memtype {
+ /* IN variables. */
+ xen_pfn_t mfn;
+ uint64_t nr_mfns;
+ uint32_t type;
+ /* OUT variables. */
+ uint32_t handle;
+ uint32_t reg;
+};
+typedef struct xenpf_add_memtype xenpf_add_memtype_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_add_memtype_t);
+
+/*
+ * Tear down an existing memory-range type. If @handle is remembered then it
+ * should be passed in to accurately tear down the correct setting (in case
+ * of overlapping memory regions with differing types). If it is not known
+ * then @handle should be set to zero. In all cases @reg must be set.
+ * (x86-specific).
+ */
+#define XENPF_del_memtype 32
+struct xenpf_del_memtype {
+ /* IN variables. */
+ uint32_t handle;
+ uint32_t reg;
+};
+typedef struct xenpf_del_memtype xenpf_del_memtype_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_del_memtype_t);
+
+/* Read current type of an MTRR (x86-specific). */
+#define XENPF_read_memtype 33
+struct xenpf_read_memtype {
+ /* IN variables. */
+ uint32_t reg;
+ /* OUT variables. */
+ xen_pfn_t mfn;
+ uint64_t nr_mfns;
+ uint32_t type;
+};
+typedef struct xenpf_read_memtype xenpf_read_memtype_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_read_memtype_t);
+
+#define XENPF_microcode_update 35
+struct xenpf_microcode_update {
+ /* IN variables. */
+ XEN_GUEST_HANDLE(void) data; /* Pointer to microcode data */
+ uint32_t length; /* Length of microcode data. */
+};
+typedef struct xenpf_microcode_update xenpf_microcode_update_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_microcode_update_t);
+
+#define XENPF_platform_quirk 39
+#define QUIRK_NOIRQBALANCING 1 /* Do not restrict IO-APIC RTE targets */
+#define QUIRK_IOAPIC_BAD_REGSEL 2 /* IO-APIC REGSEL forgets its value */
+#define QUIRK_IOAPIC_GOOD_REGSEL 3 /* IO-APIC REGSEL behaves properly */
+struct xenpf_platform_quirk {
+ /* IN variables. */
+ uint32_t quirk_id;
+};
+typedef struct xenpf_platform_quirk xenpf_platform_quirk_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_platform_quirk_t);
+
+#define XENPF_firmware_info 50
+#define XEN_FW_DISK_INFO 1 /* from int 13 AH=08/41/48 */
+#define XEN_FW_DISK_MBR_SIGNATURE 2 /* from MBR offset 0x1b8 */
+#define XEN_FW_VBEDDC_INFO 3 /* from int 10 AX=4f15 */
+struct xenpf_firmware_info {
+ /* IN variables. */
+ uint32_t type;
+ uint32_t index;
+ /* OUT variables. */
+ union {
+ struct {
+ /* Int13, Fn48: Check Extensions Present. */
+ uint8_t device; /* %dl: bios device number */
+ uint8_t version; /* %ah: major version */
+ uint16_t interface_support; /* %cx: support bitmap */
+ /* Int13, Fn08: Legacy Get Device Parameters. */
+ uint16_t legacy_max_cylinder; /* %cl[7:6]:%ch: max cyl # */
+ uint8_t legacy_max_head; /* %dh: max head # */
+ uint8_t legacy_sectors_per_track; /* %cl[5:0]: max sector # */
+ /* Int13, Fn41: Get Device Parameters (as filled into %ds:%esi). */
+ /* NB. First uint16_t of buffer must be set to buffer size. */
+ XEN_GUEST_HANDLE(void) edd_params;
+ } disk_info; /* XEN_FW_DISK_INFO */
+ struct {
+ uint8_t device; /* bios device number */
+ uint32_t mbr_signature; /* offset 0x1b8 in mbr */
+ } disk_mbr_signature; /* XEN_FW_DISK_MBR_SIGNATURE */
+ struct {
+ /* Int10, AX=4F15: Get EDID info. */
+ uint8_t capabilities;
+ uint8_t edid_transfer_time;
+ /* must refer to 128-byte buffer */
+ XEN_GUEST_HANDLE(uint8) edid;
+ } vbeddc_info; /* XEN_FW_VBEDDC_INFO */
+ } u;
+};
+typedef struct xenpf_firmware_info xenpf_firmware_info_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_firmware_info_t);
+
+#define XENPF_enter_acpi_sleep 51
+struct xenpf_enter_acpi_sleep {
+ /* IN variables */
+ uint16_t pm1a_cnt_val; /* PM1a control value. */
+ uint16_t pm1b_cnt_val; /* PM1b control value. */
+ uint32_t sleep_state; /* Which state to enter (Sn). */
+ uint32_t flags; /* Must be zero. */
+};
+typedef struct xenpf_enter_acpi_sleep xenpf_enter_acpi_sleep_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_enter_acpi_sleep_t);
+
+#define XENPF_change_freq 52
+struct xenpf_change_freq {
+ /* IN variables */
+ uint32_t flags; /* Must be zero. */
+ uint32_t cpu; /* Physical cpu. */
+ uint64_t freq; /* New frequency (Hz). */
+};
+typedef struct xenpf_change_freq xenpf_change_freq_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_change_freq_t);
+
+/*
+ * Get idle times (nanoseconds since boot) for physical CPUs specified in the
+ * @cpumap_bitmap with range [0..@cpumap_nr_cpus-1]. The @idletime array is
+ * indexed by CPU number; only entries with the corresponding @cpumap_bitmap
+ * bit set are written to. On return, @cpumap_bitmap is modified so that any
+ * non-existent CPUs are cleared. Such CPUs have their @idletime array entry
+ * cleared.
+ */
+#define XENPF_getidletime 53
+struct xenpf_getidletime {
+ /* IN/OUT variables */
+ /* IN: CPUs to interrogate; OUT: subset of IN which are present */
+ XEN_GUEST_HANDLE(uint8) cpumap_bitmap;
+ /* IN variables */
+ /* Size of cpumap bitmap. */
+ uint32_t cpumap_nr_cpus;
+ /* Must be indexable for every cpu in cpumap_bitmap. */
+ XEN_GUEST_HANDLE(uint64) idletime;
+ /* OUT variables */
+ /* System time when the idletime snapshots were taken. */
+ uint64_t now;
+};
+typedef struct xenpf_getidletime xenpf_getidletime_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_getidletime_t);
+
+struct xen_platform_op {
+ uint32_t cmd;
+ uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
+ union {
+ struct xenpf_settime settime;
+ struct xenpf_add_memtype add_memtype;
+ struct xenpf_del_memtype del_memtype;
+ struct xenpf_read_memtype read_memtype;
+ struct xenpf_microcode_update microcode;
+ struct xenpf_platform_quirk platform_quirk;
+ struct xenpf_firmware_info firmware_info;
+ struct xenpf_enter_acpi_sleep enter_acpi_sleep;
+ struct xenpf_change_freq change_freq;
+ struct xenpf_getidletime getidletime;
+ uint8_t pad[128];
+ } u;
+};
+typedef struct xen_platform_op xen_platform_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_platform_op_t);
+
+#endif /* __XEN_PUBLIC_PLATFORM_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
*
* Scheduler state interactions
*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
* Copyright (c) 2005, Keir Fraser <keir@xensource.com>
*/
/*
* The prototype for this hypercall is:
- * long sched_op_new(int cmd, void *arg)
+ * long sched_op(int cmd, void *arg)
* @cmd == SCHEDOP_??? (scheduler operation).
* @arg == Operation-specific extra argument(s), as described below.
*
- * **NOTE**:
- * Versions of Xen prior to 3.0.2 provide only the following legacy version
+ * Versions of Xen prior to 3.0.2 provided only the following legacy version
* of this hypercall, supporting only the commands yield, block and shutdown:
* long sched_op(int cmd, unsigned long arg)
* @cmd == SCHEDOP_??? (scheduler operation).
* @arg == 0 (SCHEDOP_yield and SCHEDOP_block)
* == SHUTDOWN_* code (SCHEDOP_shutdown)
+ * This legacy version is available to new guests as sched_op_compat().
*/
/*
struct sched_shutdown {
unsigned int reason; /* SHUTDOWN_* */
};
-DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(sched_shutdown);
+typedef struct sched_shutdown sched_shutdown_t;
+DEFINE_XEN_GUEST_HANDLE(sched_shutdown_t);
/*
* Poll a set of event-channel ports. Return when one or more are pending. An
*/
#define SCHEDOP_poll 3
struct sched_poll {
- GUEST_HANDLE(evtchn_port_t) ports;
+ XEN_GUEST_HANDLE(evtchn_port_t) ports;
unsigned int nr_ports;
uint64_t timeout;
};
-DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(sched_poll);
+typedef struct sched_poll sched_poll_t;
+DEFINE_XEN_GUEST_HANDLE(sched_poll_t);
+
+/*
+ * Declare a shutdown for another domain. The main use of this function is
+ * in interpreting shutdown requests and reasons for fully-virtualized
+ * domains. A para-virtualized domain may use SCHEDOP_shutdown directly.
+ * @arg == pointer to sched_remote_shutdown structure.
+ */
+#define SCHEDOP_remote_shutdown 4
+struct sched_remote_shutdown {
+ domid_t domain_id; /* Remote domain ID */
+ unsigned int reason; /* SHUTDOWN_xxx reason */
+};
+typedef struct sched_remote_shutdown sched_remote_shutdown_t;
+DEFINE_XEN_GUEST_HANDLE(sched_remote_shutdown_t);
/*
* Reason codes for SCHEDOP_shutdown. These may be interpreted by control
#define SHUTDOWN_crash 3 /* Tell controller we've crashed. */
#endif /* __XEN_PUBLIC_SCHED_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * sysctl.h
+ *
+ * System management operations. For use by node control stack.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2002-2006, K Fraser
+ */
+
+#ifndef __XEN_PUBLIC_SYSCTL_H__
+#define __XEN_PUBLIC_SYSCTL_H__
+
+#if !defined(__XEN__) && !defined(__XEN_TOOLS__)
+#error "sysctl operations are intended for use by node control tools only"
+#endif
+
+#include "xen.h"
+#include "domctl.h"
+
+#define XEN_SYSCTL_INTERFACE_VERSION 0x00000006
+
+/*
+ * Read console content from Xen buffer ring.
+ */
+#define XEN_SYSCTL_readconsole 1
+struct xen_sysctl_readconsole {
+ /* IN: Non-zero -> clear after reading. */
+ uint8_t clear;
+ /* IN: Non-zero -> start index specified by @index field. */
+ uint8_t incremental;
+ uint8_t pad0, pad1;
+ /*
+ * IN: Start index for consuming from ring buffer (if @incremental);
+ * OUT: End index after consuming from ring buffer.
+ */
+ uint32_t index;
+ /* IN: Virtual address to write console data. */
+ XEN_GUEST_HANDLE_64(char) buffer;
+ /* IN: Size of buffer; OUT: Bytes written to buffer. */
+ uint32_t count;
+};
+typedef struct xen_sysctl_readconsole xen_sysctl_readconsole_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_readconsole_t);
+
+/* Get trace buffers machine base address */
+#define XEN_SYSCTL_tbuf_op 2
+struct xen_sysctl_tbuf_op {
+ /* IN variables */
+#define XEN_SYSCTL_TBUFOP_get_info 0
+#define XEN_SYSCTL_TBUFOP_set_cpu_mask 1
+#define XEN_SYSCTL_TBUFOP_set_evt_mask 2
+#define XEN_SYSCTL_TBUFOP_set_size 3
+#define XEN_SYSCTL_TBUFOP_enable 4
+#define XEN_SYSCTL_TBUFOP_disable 5
+ uint32_t cmd;
+ /* IN/OUT variables */
+ struct xenctl_cpumap cpu_mask;
+ uint32_t evt_mask;
+ /* OUT variables */
+ uint64_aligned_t buffer_mfn;
+ uint32_t size;
+};
+typedef struct xen_sysctl_tbuf_op xen_sysctl_tbuf_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_tbuf_op_t);
+
+/*
+ * Get physical information about the host machine
+ */
+#define XEN_SYSCTL_physinfo 3
+struct xen_sysctl_physinfo {
+ /* IN variables. */
+ uint32_t threads_per_core;
+ uint32_t cores_per_socket;
+ uint32_t nr_cpus;
+ uint32_t nr_nodes;
+ uint32_t cpu_khz;
+ uint64_aligned_t total_pages;
+ uint64_aligned_t free_pages;
+ uint64_aligned_t scrub_pages;
+ uint32_t hw_cap[8];
+
+ /* IN/OUT variables. */
+ /*
+ * IN: maximum addressable entry in the caller-provided cpu_to_node array.
+ * OUT: largest cpu identifier in the system.
+ * If OUT is greater than IN then the cpu_to_node array is truncated!
+ */
+ uint32_t max_cpu_id;
+ /*
+ * If not NULL, this array is filled with node identifier for each cpu.
+ * If a cpu has no node information (e.g., cpu not present) then the
+ * sentinel value ~0u is written.
+ * The size of this array is specified by the caller in @max_cpu_id.
+ * If the actual @max_cpu_id is smaller than the array then the trailing
+ * elements of the array will not be written by the sysctl.
+ */
+ XEN_GUEST_HANDLE_64(uint32) cpu_to_node;
+};
+typedef struct xen_sysctl_physinfo xen_sysctl_physinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_physinfo_t);
+
+/*
+ * Get the ID of the current scheduler.
+ */
+#define XEN_SYSCTL_sched_id 4
+struct xen_sysctl_sched_id {
+ /* OUT variable */
+ uint32_t sched_id;
+};
+typedef struct xen_sysctl_sched_id xen_sysctl_sched_id_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_sched_id_t);
+
+/* Interface for controlling Xen software performance counters. */
+#define XEN_SYSCTL_perfc_op 5
+/* Sub-operations: */
+#define XEN_SYSCTL_PERFCOP_reset 1 /* Reset all counters to zero. */
+#define XEN_SYSCTL_PERFCOP_query 2 /* Get perfctr information. */
+struct xen_sysctl_perfc_desc {
+ char name[80]; /* name of perf counter */
+ uint32_t nr_vals; /* number of values for this counter */
+};
+typedef struct xen_sysctl_perfc_desc xen_sysctl_perfc_desc_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_desc_t);
+typedef uint32_t xen_sysctl_perfc_val_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_val_t);
+
+struct xen_sysctl_perfc_op {
+ /* IN variables. */
+ uint32_t cmd; /* XEN_SYSCTL_PERFCOP_??? */
+ /* OUT variables. */
+ uint32_t nr_counters; /* number of counters description */
+ uint32_t nr_vals; /* number of values */
+ /* counter information (or NULL) */
+ XEN_GUEST_HANDLE_64(xen_sysctl_perfc_desc_t) desc;
+ /* counter values (or NULL) */
+ XEN_GUEST_HANDLE_64(xen_sysctl_perfc_val_t) val;
+};
+typedef struct xen_sysctl_perfc_op xen_sysctl_perfc_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_op_t);
+
+#define XEN_SYSCTL_getdomaininfolist 6
+struct xen_sysctl_getdomaininfolist {
+ /* IN variables. */
+ domid_t first_domain;
+ uint32_t max_domains;
+ XEN_GUEST_HANDLE_64(xen_domctl_getdomaininfo_t) buffer;
+ /* OUT variables. */
+ uint32_t num_domains;
+};
+typedef struct xen_sysctl_getdomaininfolist xen_sysctl_getdomaininfolist_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getdomaininfolist_t);
+
+/* Inject debug keys into Xen. */
+#define XEN_SYSCTL_debug_keys 7
+struct xen_sysctl_debug_keys {
+ /* IN variables. */
+ XEN_GUEST_HANDLE_64(char) keys;
+ uint32_t nr_keys;
+};
+typedef struct xen_sysctl_debug_keys xen_sysctl_debug_keys_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_debug_keys_t);
+
+/* Get physical CPU information. */
+#define XEN_SYSCTL_getcpuinfo 8
+struct xen_sysctl_cpuinfo {
+ uint64_aligned_t idletime;
+};
+typedef struct xen_sysctl_cpuinfo xen_sysctl_cpuinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpuinfo_t);
+struct xen_sysctl_getcpuinfo {
+ /* IN variables. */
+ uint32_t max_cpus;
+ XEN_GUEST_HANDLE_64(xen_sysctl_cpuinfo_t) info;
+ /* OUT variables. */
+ uint32_t nr_cpus;
+};
+typedef struct xen_sysctl_getcpuinfo xen_sysctl_getcpuinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getcpuinfo_t);
+
+#define XEN_SYSCTL_availheap 9
+struct xen_sysctl_availheap {
+ /* IN variables. */
+ uint32_t min_bitwidth; /* Smallest address width (zero if don't care). */
+ uint32_t max_bitwidth; /* Largest address width (zero if don't care). */
+ int32_t node; /* NUMA node of interest (-1 for all nodes). */
+ /* OUT variables. */
+ uint64_aligned_t avail_bytes;/* Bytes available in the specified region. */
+};
+typedef struct xen_sysctl_availheap xen_sysctl_availheap_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_availheap_t);
+
+struct xen_sysctl {
+ uint32_t cmd;
+ uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
+ union {
+ struct xen_sysctl_readconsole readconsole;
+ struct xen_sysctl_tbuf_op tbuf_op;
+ struct xen_sysctl_physinfo physinfo;
+ struct xen_sysctl_sched_id sched_id;
+ struct xen_sysctl_perfc_op perfc_op;
+ struct xen_sysctl_getdomaininfolist getdomaininfolist;
+ struct xen_sysctl_debug_keys debug_keys;
+ struct xen_sysctl_getcpuinfo getcpuinfo;
+ struct xen_sysctl_availheap availheap;
+ uint8_t pad[128];
+ } u;
+};
+typedef struct xen_sysctl xen_sysctl_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_t);
+
+#endif /* __XEN_PUBLIC_SYSCTL_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * include/public/trace.h
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Mark Williamson, (C) 2004 Intel Research Cambridge
+ * Copyright (C) 2005 Bin Ren
+ */
+
+#ifndef __XEN_PUBLIC_TRACE_H__
+#define __XEN_PUBLIC_TRACE_H__
+
+#define TRACE_EXTRA_MAX 7
+#define TRACE_EXTRA_SHIFT 28
+
+/* Trace classes */
+#define TRC_CLS_SHIFT 16
+#define TRC_GEN 0x0001f000 /* General trace */
+#define TRC_SCHED 0x0002f000 /* Xen Scheduler trace */
+#define TRC_DOM0OP 0x0004f000 /* Xen DOM0 operation trace */
+#define TRC_HVM 0x0008f000 /* Xen HVM trace */
+#define TRC_MEM 0x0010f000 /* Xen memory trace */
+#define TRC_PV 0x0020f000 /* Xen PV traces */
+#define TRC_ALL 0x0ffff000
+#define TRC_HD_TO_EVENT(x) ((x)&0x0fffffff)
+#define TRC_HD_CYCLE_FLAG (1UL<<31)
+#define TRC_HD_INCLUDES_CYCLE_COUNT(x) ( !!( (x) & TRC_HD_CYCLE_FLAG ) )
+#define TRC_HD_EXTRA(x) (((x)>>TRACE_EXTRA_SHIFT)&TRACE_EXTRA_MAX)
+
+/* Trace subclasses */
+#define TRC_SUBCLS_SHIFT 12
+
+/* trace subclasses for SVM */
+#define TRC_HVM_ENTRYEXIT 0x00081000 /* VMENTRY and #VMEXIT */
+#define TRC_HVM_HANDLER 0x00082000 /* various HVM handlers */
+
+/* Trace events per class */
+#define TRC_LOST_RECORDS (TRC_GEN + 1)
+#define TRC_TRACE_WRAP_BUFFER (TRC_GEN + 2)
+#define TRC_TRACE_CPU_CHANGE (TRC_GEN + 3)
+
+#define TRC_SCHED_DOM_ADD (TRC_SCHED + 1)
+#define TRC_SCHED_DOM_REM (TRC_SCHED + 2)
+#define TRC_SCHED_SLEEP (TRC_SCHED + 3)
+#define TRC_SCHED_WAKE (TRC_SCHED + 4)
+#define TRC_SCHED_YIELD (TRC_SCHED + 5)
+#define TRC_SCHED_BLOCK (TRC_SCHED + 6)
+#define TRC_SCHED_SHUTDOWN (TRC_SCHED + 7)
+#define TRC_SCHED_CTL (TRC_SCHED + 8)
+#define TRC_SCHED_ADJDOM (TRC_SCHED + 9)
+#define TRC_SCHED_SWITCH (TRC_SCHED + 10)
+#define TRC_SCHED_S_TIMER_FN (TRC_SCHED + 11)
+#define TRC_SCHED_T_TIMER_FN (TRC_SCHED + 12)
+#define TRC_SCHED_DOM_TIMER_FN (TRC_SCHED + 13)
+#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED + 14)
+#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED + 15)
+
+#define TRC_MEM_PAGE_GRANT_MAP (TRC_MEM + 1)
+#define TRC_MEM_PAGE_GRANT_UNMAP (TRC_MEM + 2)
+#define TRC_MEM_PAGE_GRANT_TRANSFER (TRC_MEM + 3)
+
+#define TRC_PV_HYPERCALL (TRC_PV + 1)
+#define TRC_PV_TRAP (TRC_PV + 3)
+#define TRC_PV_PAGE_FAULT (TRC_PV + 4)
+#define TRC_PV_FORCED_INVALID_OP (TRC_PV + 5)
+#define TRC_PV_EMULATE_PRIVOP (TRC_PV + 6)
+#define TRC_PV_EMULATE_4GB (TRC_PV + 7)
+#define TRC_PV_MATH_STATE_RESTORE (TRC_PV + 8)
+#define TRC_PV_PAGING_FIXUP (TRC_PV + 9)
+#define TRC_PV_GDT_LDT_MAPPING_FAULT (TRC_PV + 10)
+#define TRC_PV_PTWR_EMULATION (TRC_PV + 11)
+#define TRC_PV_PTWR_EMULATION_PAE (TRC_PV + 12)
+ /* Indicates that addresses in trace record are 64 bits */
+#define TRC_PV_64_FLAG (0x100)
+
+/* trace events per subclass */
+#define TRC_HVM_VMENTRY (TRC_HVM_ENTRYEXIT + 0x01)
+#define TRC_HVM_VMEXIT (TRC_HVM_ENTRYEXIT + 0x02)
+#define TRC_HVM_VMEXIT64 (TRC_HVM_ENTRYEXIT + 0x03)
+#define TRC_HVM_PF_XEN (TRC_HVM_HANDLER + 0x01)
+#define TRC_HVM_PF_INJECT (TRC_HVM_HANDLER + 0x02)
+#define TRC_HVM_INJ_EXC (TRC_HVM_HANDLER + 0x03)
+#define TRC_HVM_INJ_VIRQ (TRC_HVM_HANDLER + 0x04)
+#define TRC_HVM_REINJ_VIRQ (TRC_HVM_HANDLER + 0x05)
+#define TRC_HVM_IO_READ (TRC_HVM_HANDLER + 0x06)
+#define TRC_HVM_IO_WRITE (TRC_HVM_HANDLER + 0x07)
+#define TRC_HVM_CR_READ (TRC_HVM_HANDLER + 0x08)
+#define TRC_HVM_CR_WRITE (TRC_HVM_HANDLER + 0x09)
+#define TRC_HVM_DR_READ (TRC_HVM_HANDLER + 0x0A)
+#define TRC_HVM_DR_WRITE (TRC_HVM_HANDLER + 0x0B)
+#define TRC_HVM_MSR_READ (TRC_HVM_HANDLER + 0x0C)
+#define TRC_HVM_MSR_WRITE (TRC_HVM_HANDLER + 0x0D)
+#define TRC_HVM_CPUID (TRC_HVM_HANDLER + 0x0E)
+#define TRC_HVM_INTR (TRC_HVM_HANDLER + 0x0F)
+#define TRC_HVM_NMI (TRC_HVM_HANDLER + 0x10)
+#define TRC_HVM_SMI (TRC_HVM_HANDLER + 0x11)
+#define TRC_HVM_VMMCALL (TRC_HVM_HANDLER + 0x12)
+#define TRC_HVM_HLT (TRC_HVM_HANDLER + 0x13)
+#define TRC_HVM_INVLPG (TRC_HVM_HANDLER + 0x14)
+#define TRC_HVM_MCE (TRC_HVM_HANDLER + 0x15)
+#define TRC_HVM_IO_ASSIST (TRC_HVM_HANDLER + 0x16)
+#define TRC_HVM_MMIO_ASSIST (TRC_HVM_HANDLER + 0x17)
+#define TRC_HVM_CLTS (TRC_HVM_HANDLER + 0x18)
+#define TRC_HVM_LMSW (TRC_HVM_HANDLER + 0x19)
+#define TRC_HVM_PF_XEN64 (TRC_HVM_HANDLER + 0x20)
+
+/* This structure represents a single trace buffer record. */
+struct t_rec {
+ uint32_t event:28;
+ uint32_t extra_u32:3; /* # entries in trailing extra_u32[] array */
+ uint32_t cycles_included:1; /* u.cycles or u.no_cycles? */
+ union {
+ struct {
+ uint32_t cycles_lo, cycles_hi; /* cycle counter timestamp */
+ uint32_t extra_u32[7]; /* event data items */
+ } cycles;
+ struct {
+ uint32_t extra_u32[7]; /* event data items */
+ } nocycles;
+ } u;
+};
+
+/*
+ * This structure contains the metadata for a single trace buffer. The head
+ * field, indexes into an array of struct t_rec's.
+ */
+struct t_buf {
+ /* Assume the data buffer size is X. X is generally not a power of 2.
+ * CONS and PROD are incremented modulo (2*X):
+ * 0 <= cons < 2*X
+ * 0 <= prod < 2*X
+ * This is done because addition modulo X breaks at 2^32 when X is not a
+ * power of 2:
+ * (((2^32 - 1) % X) + 1) % X != (2^32) % X
+ */
+ uint32_t cons; /* Offset of next item to be consumed by control tools. */
+ uint32_t prod; /* Offset of next item to be produced by Xen. */
+ /* Records follow immediately after the meta-data header. */
+};
+
+#endif /* __XEN_PUBLIC_TRACE_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
/*
* Prototype for this hypercall is:
- * int vcpu_op(int cmd, int vcpuid, void *extra_args)
- * @cmd == VCPUOP_??? (VCPU operation).
- * @vcpuid == VCPU to operate on.
+ * int vcpu_op(int cmd, int vcpuid, void *extra_args)
+ * @cmd == VCPUOP_??? (VCPU operation).
+ * @vcpuid == VCPU to operate on.
* @extra_args == Operation-specific extra arguments (NULL if none).
*/
* newly-initialised VCPU will not run until it is brought up by VCPUOP_up.
*
* @extra_arg == pointer to vcpu_guest_context structure containing initial
- * state for the VCPU.
+ * state for the VCPU.
*/
-#define VCPUOP_initialise 0
+#define VCPUOP_initialise 0
/*
* Bring up a VCPU. This makes the VCPU runnable. This operation will fail
* if the VCPU has not been initialised (VCPUOP_initialise).
*/
-#define VCPUOP_up 1
+#define VCPUOP_up 1
/*
* Bring down a VCPU (i.e., make it non-runnable).
* There are a few caveats that callers should observe:
- * 1. This operation may return, and VCPU_is_up may return false, before the
- * VCPU stops running (i.e., the command is asynchronous). It is a good
- * idea to ensure that the VCPU has entered a non-critical loop before
- * bringing it down. Alternatively, this operation is guaranteed
- * synchronous if invoked by the VCPU itself.
- * 2. After a VCPU is initialised, there is currently no way to drop all its
- * references to domain memory. Even a VCPU that is down still holds
- * memory references via its pagetable base pointer and GDT. It is good
- * practise to move a VCPU onto an 'idle' or default page table, LDT and
- * GDT before bringing it down.
+ * 1. This operation may return, and VCPU_is_up may return false, before the
+ * VCPU stops running (i.e., the command is asynchronous). It is a good
+ * idea to ensure that the VCPU has entered a non-critical loop before
+ * bringing it down. Alternatively, this operation is guaranteed
+ * synchronous if invoked by the VCPU itself.
+ * 2. After a VCPU is initialised, there is currently no way to drop all its
+ * references to domain memory. Even a VCPU that is down still holds
+ * memory references via its pagetable base pointer and GDT. It is good
+ * practise to move a VCPU onto an 'idle' or default page table, LDT and
+ * GDT before bringing it down.
*/
-#define VCPUOP_down 2
+#define VCPUOP_down 2
/* Returns 1 if the given VCPU is up. */
-#define VCPUOP_is_up 3
+#define VCPUOP_is_up 3
/*
* Return information about the state and running time of a VCPU.
* @extra_arg == pointer to vcpu_runstate_info structure.
*/
-#define VCPUOP_get_runstate_info 4
+#define VCPUOP_get_runstate_info 4
struct vcpu_runstate_info {
- /* VCPU's current state (RUNSTATE_*). */
- int state;
- /* When was current state entered (system time, ns)? */
- uint64_t state_entry_time;
- /*
- * Time spent in each RUNSTATE_* (ns). The sum of these times is
- * guaranteed not to drift from system time.
- */
- uint64_t time[4];
+ /* VCPU's current state (RUNSTATE_*). */
+ int state;
+ /* When was current state entered (system time, ns)? */
+ uint64_t state_entry_time;
+ /*
+ * Time spent in each RUNSTATE_* (ns). The sum of these times is
+ * guaranteed not to drift from system time.
+ */
+ uint64_t time[4];
};
-DEFINE_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
+typedef struct vcpu_runstate_info vcpu_runstate_info_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_t);
/* VCPU is currently running on a physical CPU. */
#define RUNSTATE_running 0
* Register a shared memory area from which the guest may obtain its own
* runstate information without needing to execute a hypercall.
* Notes:
- * 1. The registered address may be virtual or physical, depending on the
- * platform. The virtual address should be registered on x86 systems.
- * 2. Only one shared area may be registered per VCPU. The shared area is
- * updated by the hypervisor each time the VCPU is scheduled. Thus
- * runstate.state will always be RUNSTATE_running and
- * runstate.state_entry_time will indicate the system time at which the
- * VCPU was last scheduled to run.
+ * 1. The registered address may be virtual or physical or guest handle,
+ * depending on the platform. Virtual address or guest handle should be
+ * registered on x86 systems.
+ * 2. Only one shared area may be registered per VCPU. The shared area is
+ * updated by the hypervisor each time the VCPU is scheduled. Thus
+ * runstate.state will always be RUNSTATE_running and
+ * runstate.state_entry_time will indicate the system time at which the
+ * VCPU was last scheduled to run.
* @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
*/
#define VCPUOP_register_runstate_memory_area 5
struct vcpu_register_runstate_memory_area {
- union {
- GUEST_HANDLE(vcpu_runstate_info) h;
- struct vcpu_runstate_info *v;
- uint64_t p;
- } addr;
+ union {
+ XEN_GUEST_HANDLE(vcpu_runstate_info_t) h;
+ struct vcpu_runstate_info *v;
+ uint64_t p;
+ } addr;
};
+typedef struct vcpu_register_runstate_memory_area vcpu_register_runstate_memory_area_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_register_runstate_memory_area_t);
/*
* Set or stop a VCPU's periodic timer. Every VCPU has one periodic timer
* which can be set via these commands. Periods smaller than one millisecond
* may not be supported.
*/
-#define VCPUOP_set_periodic_timer 6 /* arg == vcpu_set_periodic_timer_t */
-#define VCPUOP_stop_periodic_timer 7 /* arg == NULL */
+#define VCPUOP_set_periodic_timer 6 /* arg == vcpu_set_periodic_timer_t */
+#define VCPUOP_stop_periodic_timer 7 /* arg == NULL */
struct vcpu_set_periodic_timer {
- uint64_t period_ns;
+ uint64_t period_ns;
};
-DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_periodic_timer);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_set_periodic_timer);
+typedef struct vcpu_set_periodic_timer vcpu_set_periodic_timer_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_set_periodic_timer_t);
/*
* Set or stop a VCPU's single-shot timer. Every VCPU has one single-shot
* timer which can be set via these commands.
*/
-#define VCPUOP_set_singleshot_timer 8 /* arg == vcpu_set_singleshot_timer_t */
+#define VCPUOP_set_singleshot_timer 8 /* arg == vcpu_set_singleshot_timer_t */
#define VCPUOP_stop_singleshot_timer 9 /* arg == NULL */
struct vcpu_set_singleshot_timer {
- uint64_t timeout_abs_ns;
- uint32_t flags; /* VCPU_SSHOTTMR_??? */
+ uint64_t timeout_abs_ns; /* Absolute system time value in nanoseconds. */
+ uint32_t flags; /* VCPU_SSHOTTMR_??? */
};
-DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_singleshot_timer);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_set_singleshot_timer);
+typedef struct vcpu_set_singleshot_timer vcpu_set_singleshot_timer_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_set_singleshot_timer_t);
/* Flags to VCPUOP_set_singleshot_timer. */
/* Require the timeout to be in the future (return -ETIME if it's passed). */
* structure in a convenient place, such as in a per-cpu data area.
* The pointer need not be page aligned, but the structure must not
* cross a page boundary.
+ *
+ * This may be called only once per vcpu.
*/
-#define VCPUOP_register_vcpu_info 10 /* arg == struct vcpu_info */
+#define VCPUOP_register_vcpu_info 10 /* arg == vcpu_register_vcpu_info_t */
struct vcpu_register_vcpu_info {
uint64_t mfn; /* mfn of page to place vcpu_info */
uint32_t offset; /* offset within page */
uint32_t rsvd; /* unused */
};
-DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info);
+typedef struct vcpu_register_vcpu_info vcpu_register_vcpu_info_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_register_vcpu_info_t);
+
+/* Send an NMI to the specified VCPU. @extra_arg == NULL. */
+#define VCPUOP_send_nmi 11
+
+/*
+ * Get the physical ID information for a pinned vcpu's underlying physical
+ * processor. The physical ID informmation is architecture-specific.
+ * On x86: id[31:0]=apic_id, id[63:32]=acpi_id, and all values 0xff and
+ * greater are reserved.
+ * This command returns -EINVAL if it is not a valid operation for this VCPU.
+ */
+#define VCPUOP_get_physid 12 /* arg == vcpu_get_physid_t */
+struct vcpu_get_physid {
+ uint64_t phys_id;
+};
+typedef struct vcpu_get_physid vcpu_get_physid_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_get_physid_t);
+#define xen_vcpu_physid_to_x86_apicid(physid) \
+ ((((uint32_t)(physid)) >= 0xff) ? 0xff : ((uint8_t)(physid)))
+#define xen_vcpu_physid_to_x86_acpiid(physid) \
+ ((((uint32_t)((physid)>>32)) >= 0xff) ? 0xff : ((uint8_t)((physid)>>32)))
#endif /* __XEN_PUBLIC_VCPU_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
*
* Xen version, type, and compile information.
*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
* Copyright (c) 2005, Nguyen Anh Quynh <aquynh@gmail.com>
* Copyright (c) 2005, Keir Fraser <keir@xensource.com>
*/
#ifndef __XEN_PUBLIC_VERSION_H__
#define __XEN_PUBLIC_VERSION_H__
-/* NB. All ops return zero on success, except XENVER_version. */
+/* NB. All ops return zero on success, except XENVER_{version,pagesize} */
/* arg == NULL; returns major:minor (16:16). */
#define XENVER_version 0
/* arg == xen_extraversion_t. */
#define XENVER_extraversion 1
+typedef char xen_extraversion_t[16];
struct xen_extraversion {
- char extraversion[16];
+ xen_extraversion_t extraversion;
};
-#define XEN_EXTRAVERSION_LEN (sizeof(struct xen_extraversion))
+#define XEN_EXTRAVERSION_LEN (sizeof(xen_extraversion_t))
/* arg == xen_compile_info_t. */
#define XENVER_compile_info 2
char compile_domain[32];
char compile_date[32];
};
+typedef struct xen_compile_info xen_compile_info_t;
#define XENVER_capabilities 3
+typedef char xen_capabilities_info_t[1024];
struct xen_capabilities_info {
- char info[1024];
+ xen_capabilities_info_t info;
};
-#define XEN_CAPABILITIES_INFO_LEN (sizeof(struct xen_capabilities_info))
+#define XEN_CAPABILITIES_INFO_LEN (sizeof(xen_capabilities_info_t))
#define XENVER_changeset 4
+typedef char xen_changeset_info_t[64];
struct xen_changeset_info {
- char info[64];
+ xen_changeset_info_t info;
};
-#define XEN_CHANGESET_INFO_LEN (sizeof(struct xen_changeset_info))
+#define XEN_CHANGESET_INFO_LEN (sizeof(xen_changeset_info_t))
#define XENVER_platform_parameters 5
struct xen_platform_parameters {
unsigned long virt_start;
};
+typedef struct xen_platform_parameters xen_platform_parameters_t;
#define XENVER_get_features 6
struct xen_feature_info {
unsigned int submap_idx; /* IN: which 32-bit submap to return */
uint32_t submap; /* OUT: 32-bit submap */
};
+typedef struct xen_feature_info xen_feature_info_t;
/* Declares the features reported by XENVER_get_features. */
#include "features.h"
+/* arg == NULL; returns host memory page size. */
+#define XENVER_pagesize 7
+
+/* arg == xen_domain_handle_t. */
+#define XENVER_guest_handle 8
+
#endif /* __XEN_PUBLIC_VERSION_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * xen-compat.h
+ *
+ * Guest OS interface to Xen. Compatibility layer.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2006, Christian Limpach
+ */
+
+#ifndef __XEN_PUBLIC_XEN_COMPAT_H__
+#define __XEN_PUBLIC_XEN_COMPAT_H__
+
+#define __XEN_LATEST_INTERFACE_VERSION__ 0x00030208
+
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+/* Xen is built with matching headers and implements the latest interface. */
+#define __XEN_INTERFACE_VERSION__ __XEN_LATEST_INTERFACE_VERSION__
+#elif !defined(__XEN_INTERFACE_VERSION__)
+/* Guests which do not specify a version get the legacy interface. */
+#define __XEN_INTERFACE_VERSION__ 0x00000000
+#endif
+
+#if __XEN_INTERFACE_VERSION__ > __XEN_LATEST_INTERFACE_VERSION__
+#error "These header files do not support the requested interface version."
+#endif
+
+#endif /* __XEN_PUBLIC_XEN_COMPAT_H__ */
*
* Guest OS interface to Xen.
*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
* Copyright (c) 2004, K A Fraser
*/
#ifndef __XEN_PUBLIC_XEN_H__
#define __XEN_PUBLIC_XEN_H__
-#include <asm/xen/interface.h>
+#include "xen-compat.h"
+#ifdef CONFIG_PARAVIRT_XEN
#include <asm/pvclock-abi.h>
+#endif
-/*
- * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
- */
+#if defined(CONFIG_PARAVIRT_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)
+#include <asm/xen/interface.h>
+#elif defined(__i386__) || defined(__x86_64__)
+#include "arch-x86/xen.h"
+#elif defined(__ia64__)
+#include "arch-ia64.h"
+#elif defined(__powerpc__)
+#include "arch-powerpc.h"
+#else
+#error "Unsupported architecture"
+#endif
+
+#ifndef __ASSEMBLY__
+/* Guest handles for primitive C types. */
+DEFINE_XEN_GUEST_HANDLE(char);
+__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
+DEFINE_XEN_GUEST_HANDLE(int);
+__DEFINE_XEN_GUEST_HANDLE(uint, unsigned int);
+DEFINE_XEN_GUEST_HANDLE(long);
+__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
+DEFINE_XEN_GUEST_HANDLE(void);
+
+DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
+#endif
/*
- * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5.
- * EAX = return value
- * (argument registers may be clobbered on return)
- * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6.
- * RAX = return value
- * (argument registers not clobbered on return; RCX, R11 are)
+ * HYPERCALLS
*/
+
#define __HYPERVISOR_set_trap_table 0
#define __HYPERVISOR_mmu_update 1
#define __HYPERVISOR_set_gdt 2
#define __HYPERVISOR_stack_switch 3
#define __HYPERVISOR_set_callbacks 4
#define __HYPERVISOR_fpu_taskswitch 5
-#define __HYPERVISOR_sched_op 6
-#define __HYPERVISOR_dom0_op 7
+#define __HYPERVISOR_sched_op_compat 6 /* compat since 0x00030101 */
+#define __HYPERVISOR_platform_op 7
#define __HYPERVISOR_set_debugreg 8
#define __HYPERVISOR_get_debugreg 9
#define __HYPERVISOR_update_descriptor 10
#define __HYPERVISOR_multicall 13
#define __HYPERVISOR_update_va_mapping 14
#define __HYPERVISOR_set_timer_op 15
-#define __HYPERVISOR_event_channel_op_compat 16
+#define __HYPERVISOR_event_channel_op_compat 16 /* compat since 0x00030202 */
#define __HYPERVISOR_xen_version 17
#define __HYPERVISOR_console_io 18
-#define __HYPERVISOR_physdev_op_compat 19
+#define __HYPERVISOR_physdev_op_compat 19 /* compat since 0x00030202 */
#define __HYPERVISOR_grant_table_op 20
#define __HYPERVISOR_vm_assist 21
#define __HYPERVISOR_update_va_mapping_otherdomain 22
#define __HYPERVISOR_vcpu_op 24
#define __HYPERVISOR_set_segment_base 25 /* x86/64 only */
#define __HYPERVISOR_mmuext_op 26
-#define __HYPERVISOR_acm_op 27
+#define __HYPERVISOR_xsm_op 27
#define __HYPERVISOR_nmi_op 28
-#define __HYPERVISOR_sched_op_new 29
+#define __HYPERVISOR_sched_op 29
#define __HYPERVISOR_callback_op 30
#define __HYPERVISOR_xenoprof_op 31
#define __HYPERVISOR_event_channel_op 32
#define __HYPERVISOR_physdev_op 33
#define __HYPERVISOR_hvm_op 34
+#define __HYPERVISOR_sysctl 35
+#define __HYPERVISOR_domctl 36
+#define __HYPERVISOR_kexec_op 37
/* Architecture-specific hypercall definitions. */
#define __HYPERVISOR_arch_0 48
#define __HYPERVISOR_arch_7 55
/*
+ * HYPERCALL COMPATIBILITY.
+ */
+
+/* New sched_op hypercall introduced in 0x00030101. */
+#if __XEN_INTERFACE_VERSION__ < 0x00030101 || (defined(CONFIG_PARAVIRT_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H))
+#undef __HYPERVISOR_sched_op
+#define __HYPERVISOR_sched_op __HYPERVISOR_sched_op_compat
+#endif
+
+/* New event-channel and physdev hypercalls introduced in 0x00030202. */
+#if __XEN_INTERFACE_VERSION__ < 0x00030202
+#undef __HYPERVISOR_event_channel_op
+#define __HYPERVISOR_event_channel_op __HYPERVISOR_event_channel_op_compat
+#undef __HYPERVISOR_physdev_op
+#define __HYPERVISOR_physdev_op __HYPERVISOR_physdev_op_compat
+#endif
+
+/* New platform_op hypercall introduced in 0x00030204. */
+#if __XEN_INTERFACE_VERSION__ < 0x00030204 || (defined(CONFIG_PARAVIRT_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H))
+#define __HYPERVISOR_dom0_op __HYPERVISOR_platform_op
+#endif
+
+/*
* VIRTUAL INTERRUPTS
*
* Virtual interrupts that a guest OS may receive from Xen.
+ *
+ * In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a
+ * global VIRQ. The former can be bound once per VCPU and cannot be re-bound.
+ * The latter can be allocated only once per guest: they must initially be
+ * allocated to VCPU0 but can subsequently be re-bound.
*/
-#define VIRQ_TIMER 0 /* Timebase update, and/or requested timeout. */
-#define VIRQ_DEBUG 1 /* Request guest to dump debug info. */
-#define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */
-#define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */
-#define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */
+#define VIRQ_TIMER 0 /* V. Timebase update, and/or requested timeout. */
+#define VIRQ_DEBUG 1 /* V. Request guest to dump debug info. */
+#define VIRQ_CONSOLE 2 /* G. (DOM0) Bytes received on emergency console. */
+#define VIRQ_DOM_EXC 3 /* G. (DOM0) Exceptional event for some domain. */
+#define VIRQ_TBUF 4 /* G. (DOM0) Trace buffer has records available. */
+#define VIRQ_DEBUGGER 6 /* G. (DOM0) A domain has paused for debugging. */
+#define VIRQ_XENOPROF 7 /* V. XenOprofile interrupt: new sample available */
+#define VIRQ_CON_RING 8 /* G. (DOM0) Bytes received on console */
/* Architecture-specific VIRQ definitions. */
#define VIRQ_ARCH_0 16
#define VIRQ_ARCH_7 23
#define NR_VIRQS 24
+
/*
* MMU-UPDATE REQUESTS
*
* ptr[:2] -- Machine address within the frame whose mapping to modify.
* The frame must belong to the FD, if one is specified.
* val -- Value to write into the mapping entry.
+ *
+ * ptr[1:0] == MMU_PT_UPDATE_PRESERVE_AD:
+ * As MMU_NORMAL_PT_UPDATE above, but A/D bits currently in the PTE are ORed
+ * with those in @val.
*/
-#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */
-#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */
+#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */
+#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */
+#define MMU_PT_UPDATE_PRESERVE_AD 2 /* atomically: *ptr = val | (*ptr&(A|D)) */
/*
* MMU EXTENDED OPERATIONS
#ifndef __ASSEMBLY__
struct mmuext_op {
- unsigned int cmd;
- union {
- /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
- unsigned long mfn;
- /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
- unsigned long linear_addr;
- } arg1;
- union {
- /* SET_LDT */
- unsigned int nr_ents;
- /* TLB_FLUSH_MULTI, INVLPG_MULTI */
- void *vcpumask;
- } arg2;
+ unsigned int cmd;
+ union {
+ /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
+ xen_pfn_t mfn;
+ /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
+ unsigned long linear_addr;
+ } arg1;
+ union {
+ /* SET_LDT */
+ unsigned int nr_ents;
+ /* TLB_FLUSH_MULTI, INVLPG_MULTI */
+#if __XEN_INTERFACE_VERSION__ >= 0x00030205
+ XEN_GUEST_HANDLE(void) vcpumask;
+#else
+ void *vcpumask;
+#endif
+ } arg2;
};
-DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(mmuext_op);
+typedef struct mmuext_op mmuext_op_t;
+DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
#endif
/* These are passed as 'flags' to update_va_mapping. They can be ORed. */
*/
#define VMASST_CMD_enable 0
#define VMASST_CMD_disable 1
+
+/* x86/32 guests: simulate full 4GB segment limits. */
#define VMASST_TYPE_4gb_segments 0
+
+/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */
#define VMASST_TYPE_4gb_segments_notify 1
+
+/*
+ * x86 guests: support writes to bottom-level PTEs.
+ * NB1. Page-directory entries cannot be written.
+ * NB2. Guest must continue to remove all writable mappings of PTEs.
+ */
#define VMASST_TYPE_writable_pagetables 2
+
+/* x86/PAE guests: support PDPTs above 4GB. */
#define VMASST_TYPE_pae_extended_cr3 3
-#define MAX_VMASST_TYPE 3
+
+#define MAX_VMASST_TYPE 3
#ifndef __ASSEMBLY__
uint64_t ptr; /* Machine address of PTE. */
uint64_t val; /* New contents of PTE. */
};
-DEFINE_GUEST_HANDLE_STRUCT(mmu_update);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(mmu_update);
+typedef struct mmu_update mmu_update_t;
+DEFINE_XEN_GUEST_HANDLE(mmu_update_t);
/*
* Send an array of these to HYPERVISOR_multicall().
*/
struct multicall_entry {
unsigned long op;
+#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+ unsigned long result;
+#else
long result;
+#endif
unsigned long args[6];
};
-DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);
+DEFINE_XEN_GUEST_HANDLE_STRUCT(multicall_entry);
+typedef struct multicall_entry multicall_entry_t;
+DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);
/*
* Event channel endpoints per domain:
#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
struct vcpu_time_info {
- /*
- * Updates to the following values are preceded and followed
- * by an increment of 'version'. The guest can therefore
- * detect updates by looking for changes to 'version'. If the
- * least-significant bit of the version number is set then an
- * update is in progress and the guest must wait to read a
- * consistent set of values. The correct way to interact with
- * the version number is similar to Linux's seqlock: see the
- * implementations of read_seqbegin/read_seqretry.
- */
- uint32_t version;
- uint32_t pad0;
- uint64_t tsc_timestamp; /* TSC at last update of time vals. */
- uint64_t system_time; /* Time, in nanosecs, since boot. */
- /*
- * Current system time:
- * system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul
- * CPU frequency (Hz):
- * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
- */
- uint32_t tsc_to_system_mul;
- int8_t tsc_shift;
- int8_t pad1[3];
+ /*
+ * Updates to the following values are preceded and followed by an
+ * increment of 'version'. The guest can therefore detect updates by
+ * looking for changes to 'version'. If the least-significant bit of
+ * the version number is set then an update is in progress and the guest
+ * must wait to read a consistent set of values.
+ * The correct way to interact with the version number is similar to
+ * Linux's seqlock: see the implementations of read_seqbegin/read_seqretry.
+ */
+ uint32_t version;
+ uint32_t pad0;
+ uint64_t tsc_timestamp; /* TSC at last update of time vals. */
+ uint64_t system_time; /* Time, in nanosecs, since boot. */
+ /*
+ * Current system time:
+ * system_time +
+ * ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32)
+ * CPU frequency (Hz):
+ * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
+ */
+ uint32_t tsc_to_system_mul;
+ int8_t tsc_shift;
+ int8_t pad1[3];
}; /* 32 bytes */
+typedef struct vcpu_time_info vcpu_time_info_t;
struct vcpu_info {
- /*
- * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
- * a pending notification for a particular VCPU. It is then cleared
- * by the guest OS /before/ checking for pending work, thus avoiding
- * a set-and-check race. Note that the mask is only accessed by Xen
- * on the CPU that is currently hosting the VCPU. This means that the
- * pending and mask flags can be updated by the guest without special
- * synchronisation (i.e., no need for the x86 LOCK prefix).
- * This may seem suboptimal because if the pending flag is set by
- * a different CPU then an IPI may be scheduled even when the mask
- * is set. However, note:
- * 1. The task of 'interrupt holdoff' is covered by the per-event-
- * channel mask bits. A 'noisy' event that is continually being
- * triggered can be masked at source at this very precise
- * granularity.
- * 2. The main purpose of the per-VCPU mask is therefore to restrict
- * reentrant execution: whether for concurrency control, or to
- * prevent unbounded stack usage. Whatever the purpose, we expect
- * that the mask will be asserted only for short periods at a time,
- * and so the likelihood of a 'spurious' IPI is suitably small.
- * The mask is read before making an event upcall to the guest: a
- * non-zero mask therefore guarantees that the VCPU will not receive
- * an upcall activation. The mask is cleared when the VCPU requests
- * to block: this avoids wakeup-waiting races.
- */
- uint8_t evtchn_upcall_pending;
- uint8_t evtchn_upcall_mask;
- unsigned long evtchn_pending_sel;
- struct arch_vcpu_info arch;
- struct pvclock_vcpu_time_info time;
+ /*
+ * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
+ * a pending notification for a particular VCPU. It is then cleared
+ * by the guest OS /before/ checking for pending work, thus avoiding
+ * a set-and-check race. Note that the mask is only accessed by Xen
+ * on the CPU that is currently hosting the VCPU. This means that the
+ * pending and mask flags can be updated by the guest without special
+ * synchronisation (i.e., no need for the x86 LOCK prefix).
+ * This may seem suboptimal because if the pending flag is set by
+ * a different CPU then an IPI may be scheduled even when the mask
+ * is set. However, note:
+ * 1. The task of 'interrupt holdoff' is covered by the per-event-
+ * channel mask bits. A 'noisy' event that is continually being
+ * triggered can be masked at source at this very precise
+ * granularity.
+ * 2. The main purpose of the per-VCPU mask is therefore to restrict
+ * reentrant execution: whether for concurrency control, or to
+ * prevent unbounded stack usage. Whatever the purpose, we expect
+ * that the mask will be asserted only for short periods at a time,
+ * and so the likelihood of a 'spurious' IPI is suitably small.
+ * The mask is read before making an event upcall to the guest: a
+ * non-zero mask therefore guarantees that the VCPU will not receive
+ * an upcall activation. The mask is cleared when the VCPU requests
+ * to block: this avoids wakeup-waiting races.
+ */
+ uint8_t evtchn_upcall_pending;
+ uint8_t evtchn_upcall_mask;
+ unsigned long evtchn_pending_sel;
+ struct arch_vcpu_info arch;
+#ifdef CONFIG_PARAVIRT_XEN
+ struct pvclock_vcpu_time_info time;
+#else
+ struct vcpu_time_info time;
+#endif
}; /* 64 bytes (x86) */
+#ifndef __XEN__
+typedef struct vcpu_info vcpu_info_t;
+#endif
/*
* Xen/kernel shared data -- pointer provided in start_info.
- * NB. We expect that this struct is smaller than a page.
+ *
+ * This structure is defined to be both smaller than a page, and the
+ * only data on the shared page, but may vary in actual size even within
+ * compatible Xen versions; guests should not rely on the size
+ * of this structure remaining constant.
*/
struct shared_info {
- struct vcpu_info vcpu_info[MAX_VIRT_CPUS];
-
- /*
- * A domain can create "event channels" on which it can send and receive
- * asynchronous event notifications. There are three classes of event that
- * are delivered by this mechanism:
- * 1. Bi-directional inter- and intra-domain connections. Domains must
- * arrange out-of-band to set up a connection (usually by allocating
- * an unbound 'listener' port and avertising that via a storage service
- * such as xenstore).
- * 2. Physical interrupts. A domain with suitable hardware-access
- * privileges can bind an event-channel port to a physical interrupt
- * source.
- * 3. Virtual interrupts ('events'). A domain can bind an event-channel
- * port to a virtual interrupt source, such as the virtual-timer
- * device or the emergency console.
- *
- * Event channels are addressed by a "port index". Each channel is
- * associated with two bits of information:
- * 1. PENDING -- notifies the domain that there is a pending notification
- * to be processed. This bit is cleared by the guest.
- * 2. MASK -- if this bit is clear then a 0->1 transition of PENDING
- * will cause an asynchronous upcall to be scheduled. This bit is only
- * updated by the guest. It is read-only within Xen. If a channel
- * becomes pending while the channel is masked then the 'edge' is lost
- * (i.e., when the channel is unmasked, the guest must manually handle
- * pending notifications as no upcall will be scheduled by Xen).
- *
- * To expedite scanning of pending notifications, any 0->1 pending
- * transition on an unmasked channel causes a corresponding bit in a
- * per-vcpu selector word to be set. Each bit in the selector covers a
- * 'C long' in the PENDING bitfield array.
- */
- unsigned long evtchn_pending[sizeof(unsigned long) * 8];
- unsigned long evtchn_mask[sizeof(unsigned long) * 8];
-
- /*
- * Wallclock time: updated only by control software. Guests should base
- * their gettimeofday() syscall on this wallclock-base value.
- */
- struct pvclock_wall_clock wc;
-
- struct arch_shared_info arch;
+ struct vcpu_info vcpu_info[MAX_VIRT_CPUS];
+
+ /*
+ * A domain can create "event channels" on which it can send and receive
+ * asynchronous event notifications. There are three classes of event that
+ * are delivered by this mechanism:
+ * 1. Bi-directional inter- and intra-domain connections. Domains must
+ * arrange out-of-band to set up a connection (usually by allocating
+ * an unbound 'listener' port and avertising that via a storage service
+ * such as xenstore).
+ * 2. Physical interrupts. A domain with suitable hardware-access
+ * privileges can bind an event-channel port to a physical interrupt
+ * source.
+ * 3. Virtual interrupts ('events'). A domain can bind an event-channel
+ * port to a virtual interrupt source, such as the virtual-timer
+ * device or the emergency console.
+ *
+ * Event channels are addressed by a "port index". Each channel is
+ * associated with two bits of information:
+ * 1. PENDING -- notifies the domain that there is a pending notification
+ * to be processed. This bit is cleared by the guest.
+ * 2. MASK -- if this bit is clear then a 0->1 transition of PENDING
+ * will cause an asynchronous upcall to be scheduled. This bit is only
+ * updated by the guest. It is read-only within Xen. If a channel
+ * becomes pending while the channel is masked then the 'edge' is lost
+ * (i.e., when the channel is unmasked, the guest must manually handle
+ * pending notifications as no upcall will be scheduled by Xen).
+ *
+ * To expedite scanning of pending notifications, any 0->1 pending
+ * transition on an unmasked channel causes a corresponding bit in a
+ * per-vcpu selector word to be set. Each bit in the selector covers a
+ * 'C long' in the PENDING bitfield array.
+ */
+ unsigned long evtchn_pending[sizeof(unsigned long) * 8];
+ unsigned long evtchn_mask[sizeof(unsigned long) * 8];
+
+ /*
+ * Wallclock time: updated only by control software. Guests should base
+ * their gettimeofday() syscall on this wallclock-base value.
+ */
+#ifdef CONFIG_PARAVIRT_XEN
+ struct pvclock_wall_clock wc;
+#else
+ uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */
+ uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */
+ uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */
+#endif
+
+ struct arch_shared_info arch;
};
+#ifndef __XEN__
+typedef struct shared_info shared_info_t;
+#endif
/*
- * Start-of-day memory layout for the initial domain (DOM0):
+ * Start-of-day memory layout:
* 1. The domain is started within contiguous virtual-memory region.
- * 2. The contiguous region begins and ends on an aligned 4MB boundary.
- * 3. The region start corresponds to the load address of the OS image.
- * If the load address is not 4MB aligned then the address is rounded down.
- * 4. This the order of bootstrap elements in the initial virtual region:
+ * 2. The contiguous region ends on an aligned 4MB boundary.
+ * 3. This the order of bootstrap elements in the initial virtual region:
* a. relocated kernel image
* b. initial ram disk [mod_start, mod_len]
* c. list of allocated page frames [mfn_list, nr_pages]
* d. start_info_t structure [register ESI (x86)]
* e. bootstrap page tables [pt_base, CR3 (x86)]
* f. bootstrap stack [register ESP (x86)]
- * 5. Bootstrap elements are packed together, but each is 4kB-aligned.
- * 6. The initial ram disk may be omitted.
- * 7. The list of page frames forms a contiguous 'pseudo-physical' memory
+ * 4. Bootstrap elements are packed together, but each is 4kB-aligned.
+ * 5. The initial ram disk may be omitted.
+ * 6. The list of page frames forms a contiguous 'pseudo-physical' memory
* layout for the domain. In particular, the bootstrap virtual-memory
* region is a 1:1 mapping to the first section of the pseudo-physical map.
- * 8. All bootstrap elements are mapped read-writable for the guest OS. The
+ * 7. All bootstrap elements are mapped read-writable for the guest OS. The
* only exception is the bootstrap page table, which is mapped read-only.
- * 9. There is guaranteed to be at least 512kB padding after the final
+ * 8. There is guaranteed to be at least 512kB padding after the final
* bootstrap element. If necessary, the bootstrap virtual region is
* extended by an extra 4MB to ensure this.
*/
#define MAX_GUEST_CMDLINE 1024
struct start_info {
- /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */
- char magic[32]; /* "xen-<version>-<platform>". */
- unsigned long nr_pages; /* Total pages allocated to this domain. */
- unsigned long shared_info; /* MACHINE address of shared info struct. */
- uint32_t flags; /* SIF_xxx flags. */
- unsigned long store_mfn; /* MACHINE page number of shared page. */
- uint32_t store_evtchn; /* Event channel for store communication. */
- union {
- struct {
- unsigned long mfn; /* MACHINE page number of console page. */
- uint32_t evtchn; /* Event channel for console page. */
- } domU;
- struct {
- uint32_t info_off; /* Offset of console_info struct. */
- uint32_t info_size; /* Size of console_info struct from start.*/
- } dom0;
- } console;
- /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */
- unsigned long pt_base; /* VIRTUAL address of page directory. */
- unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. */
- unsigned long mfn_list; /* VIRTUAL address of page-frame list. */
- unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */
- unsigned long mod_len; /* Size (bytes) of pre-loaded module. */
- int8_t cmd_line[MAX_GUEST_CMDLINE];
+ /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */
+ char magic[32]; /* "xen-<version>-<platform>". */
+ unsigned long nr_pages; /* Total pages allocated to this domain. */
+ unsigned long shared_info; /* MACHINE address of shared info struct. */
+ uint32_t flags; /* SIF_xxx flags. */
+ xen_pfn_t store_mfn; /* MACHINE page number of shared page. */
+ uint32_t store_evtchn; /* Event channel for store communication. */
+ union {
+ struct {
+ xen_pfn_t mfn; /* MACHINE page number of console page. */
+ uint32_t evtchn; /* Event channel for console page. */
+ } domU;
+ struct {
+ uint32_t info_off; /* Offset of console_info struct. */
+ uint32_t info_size; /* Size of console_info struct from start.*/
+ } dom0;
+ } console;
+ /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */
+ unsigned long pt_base; /* VIRTUAL address of page directory. */
+ unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. */
+ unsigned long mfn_list; /* VIRTUAL address of page-frame list. */
+ unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */
+ unsigned long mod_len; /* Size (bytes) of pre-loaded module. */
+ int8_t cmd_line[MAX_GUEST_CMDLINE];
};
+typedef struct start_info start_info_t;
+
+/* New console union for dom0 introduced in 0x00030203. */
+#if __XEN_INTERFACE_VERSION__ < 0x00030203
+#define console_mfn console.domU.mfn
+#define console_evtchn console.domU.evtchn
+#endif
/* These flags are passed in the 'flags' field of start_info_t. */
#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
-typedef uint64_t cpumap_t;
+typedef struct dom0_vga_console_info {
+ uint8_t video_type; /* DOM0_VGA_CONSOLE_??? */
+#define XEN_VGATYPE_TEXT_MODE_3 0x03
+#define XEN_VGATYPE_VESA_LFB 0x23
+
+ union {
+ struct {
+ /* Font height, in pixels. */
+ uint16_t font_height;
+ /* Cursor location (column, row). */
+ uint16_t cursor_x, cursor_y;
+ /* Number of rows and columns (dimensions in characters). */
+ uint16_t rows, columns;
+ } text_mode_3;
+
+ struct {
+ /* Width and height, in pixels. */
+ uint16_t width, height;
+ /* Bytes per scan line. */
+ uint16_t bytes_per_line;
+ /* Bits per pixel. */
+ uint16_t bits_per_pixel;
+ /* LFB physical address, and size (in units of 64kB). */
+ uint32_t lfb_base;
+ uint32_t lfb_size;
+ /* RGB mask offsets and sizes, as defined by VBE 1.2+ */
+ uint8_t red_pos, red_size;
+ uint8_t green_pos, green_size;
+ uint8_t blue_pos, blue_size;
+ uint8_t rsvd_pos, rsvd_size;
+#if __XEN_INTERFACE_VERSION__ >= 0x00030206
+ /* VESA capabilities (offset 0xa, VESA command 0x4f00). */
+ uint32_t gbl_caps;
+ /* Mode attributes (offset 0x0, VESA command 0x4f01). */
+ uint16_t mode_attrs;
+#endif
+ } vesa_lfb;
+ } u;
+} dom0_vga_console_info_t;
+#define xen_vga_console_info dom0_vga_console_info
+#define xen_vga_console_info_t dom0_vga_console_info_t
typedef uint8_t xen_domain_handle_t[16];
#define __mk_unsigned_long(x) x ## UL
#define mk_unsigned_long(x) __mk_unsigned_long(x)
+__DEFINE_XEN_GUEST_HANDLE(uint8, uint8_t);
+__DEFINE_XEN_GUEST_HANDLE(uint16, uint16_t);
+__DEFINE_XEN_GUEST_HANDLE(uint32, uint32_t);
+__DEFINE_XEN_GUEST_HANDLE(uint64, uint64_t);
+
#else /* __ASSEMBLY__ */
/* In assembly code we cannot use C numeric constant suffixes. */
#endif /* !__ASSEMBLY__ */
+/* Default definitions for macros used by domctl/sysctl. */
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+#ifndef uint64_aligned_t
+#define uint64_aligned_t uint64_t
+#endif
+#ifndef XEN_GUEST_HANDLE_64
+#define XEN_GUEST_HANDLE_64(name) XEN_GUEST_HANDLE(name)
+#endif
+#endif
+
#endif /* __XEN_PUBLIC_XEN_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * xenoprof.h
+ *
+ * Interface for enabling system wide profiling based on hardware performance
+ * counters
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (C) 2005 Hewlett-Packard Co.
+ * Written by Aravind Menon & Jose Renato Santos
+ */
+
+#ifndef __XEN_PUBLIC_XENOPROF_H__
+#define __XEN_PUBLIC_XENOPROF_H__
+
+#include "xen.h"
+
+/*
+ * Commands to HYPERVISOR_xenoprof_op().
+ */
+#define XENOPROF_init 0
+#define XENOPROF_reset_active_list 1
+#define XENOPROF_reset_passive_list 2
+#define XENOPROF_set_active 3
+#define XENOPROF_set_passive 4
+#define XENOPROF_reserve_counters 5
+#define XENOPROF_counter 6
+#define XENOPROF_setup_events 7
+#define XENOPROF_enable_virq 8
+#define XENOPROF_start 9
+#define XENOPROF_stop 10
+#define XENOPROF_disable_virq 11
+#define XENOPROF_release_counters 12
+#define XENOPROF_shutdown 13
+#define XENOPROF_get_buffer 14
+#define XENOPROF_set_backtrace 15
+#define XENOPROF_last_op 15
+
+#define MAX_OPROF_EVENTS 32
+#define MAX_OPROF_DOMAINS 25
+#define XENOPROF_CPU_TYPE_SIZE 64
+
+/* Xenoprof performance events (not Xen events) */
+struct event_log {
+ uint64_t eip;
+ uint8_t mode;
+ uint8_t event;
+};
+
+/* PC value that indicates a special code */
+#define XENOPROF_ESCAPE_CODE ~0UL
+/* Transient events for the xenoprof->oprofile cpu buf */
+#define XENOPROF_TRACE_BEGIN 1
+
+/* Xenoprof buffer shared between Xen and domain - 1 per VCPU */
+struct xenoprof_buf {
+ uint32_t event_head;
+ uint32_t event_tail;
+ uint32_t event_size;
+ uint32_t vcpu_id;
+ uint64_t xen_samples;
+ uint64_t kernel_samples;
+ uint64_t user_samples;
+ uint64_t lost_samples;
+ struct event_log event_log[1];
+};
+#ifndef __XEN__
+typedef struct xenoprof_buf xenoprof_buf_t;
+DEFINE_XEN_GUEST_HANDLE(xenoprof_buf_t);
+#endif
+
+struct xenoprof_init {
+ int32_t num_events;
+ int32_t is_primary;
+ char cpu_type[XENOPROF_CPU_TYPE_SIZE];
+};
+typedef struct xenoprof_init xenoprof_init_t;
+DEFINE_XEN_GUEST_HANDLE(xenoprof_init_t);
+
+struct xenoprof_get_buffer {
+ int32_t max_samples;
+ int32_t nbuf;
+ int32_t bufsize;
+ uint64_t buf_gmaddr;
+};
+typedef struct xenoprof_get_buffer xenoprof_get_buffer_t;
+DEFINE_XEN_GUEST_HANDLE(xenoprof_get_buffer_t);
+
+struct xenoprof_counter {
+ uint32_t ind;
+ uint64_t count;
+ uint32_t enabled;
+ uint32_t event;
+ uint32_t hypervisor;
+ uint32_t kernel;
+ uint32_t user;
+ uint64_t unit_mask;
+};
+typedef struct xenoprof_counter xenoprof_counter_t;
+DEFINE_XEN_GUEST_HANDLE(xenoprof_counter_t);
+
+typedef struct xenoprof_passive {
+ uint16_t domain_id;
+ int32_t max_samples;
+ int32_t nbuf;
+ int32_t bufsize;
+ uint64_t buf_gmaddr;
+} xenoprof_passive_t;
+DEFINE_XEN_GUEST_HANDLE(xenoprof_passive_t);
+
+
+#endif /* __XEN_PUBLIC_XENOPROF_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * acm.h: Xen access control module interface defintions
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Reiner Sailer <sailer@watson.ibm.com>
+ * Copyright (c) 2005, International Business Machines Corporation.
+ */
+
+#ifndef _XEN_PUBLIC_ACM_H
+#define _XEN_PUBLIC_ACM_H
+
+#include "../xen.h"
+
+/* if ACM_DEBUG defined, all hooks should
+ * print a short trace message (comment it out
+ * when not in testing mode )
+ */
+/* #define ACM_DEBUG */
+
+#ifdef ACM_DEBUG
+# define printkd(fmt, args...) printk(fmt,## args)
+#else
+# define printkd(fmt, args...)
+#endif
+
+/* default ssid reference value if not supplied */
+#define ACM_DEFAULT_SSID 0x0
+#define ACM_DEFAULT_LOCAL_SSID 0x0
+
+/* Internal ACM ERROR types */
+#define ACM_OK 0
+#define ACM_UNDEF -1
+#define ACM_INIT_SSID_ERROR -2
+#define ACM_INIT_SOID_ERROR -3
+#define ACM_ERROR -4
+
+/* External ACCESS DECISIONS */
+#define ACM_ACCESS_PERMITTED 0
+#define ACM_ACCESS_DENIED -111
+#define ACM_NULL_POINTER_ERROR -200
+
+/*
+ Error codes reported in when trying to test for a new policy
+ These error codes are reported in an array of tuples where
+ each error code is followed by a parameter describing the error
+ more closely, such as a domain id.
+*/
+#define ACM_EVTCHN_SHARING_VIOLATION 0x100
+#define ACM_GNTTAB_SHARING_VIOLATION 0x101
+#define ACM_DOMAIN_LOOKUP 0x102
+#define ACM_CHWALL_CONFLICT 0x103
+#define ACM_SSIDREF_IN_USE 0x104
+
+
+/* primary policy in lower 4 bits */
+#define ACM_NULL_POLICY 0
+#define ACM_CHINESE_WALL_POLICY 1
+#define ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY 2
+#define ACM_POLICY_UNDEFINED 15
+
+/* combinations have secondary policy component in higher 4bit */
+#define ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY \
+ ((ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY << 4) | ACM_CHINESE_WALL_POLICY)
+
+/* policy: */
+#define ACM_POLICY_NAME(X) \
+ ((X) == (ACM_NULL_POLICY)) ? "NULL" : \
+ ((X) == (ACM_CHINESE_WALL_POLICY)) ? "CHINESE WALL" : \
+ ((X) == (ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "SIMPLE TYPE ENFORCEMENT" : \
+ ((X) == (ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "CHINESE WALL AND SIMPLE TYPE ENFORCEMENT" : \
+ "UNDEFINED"
+
+/* the following policy versions must be increased
+ * whenever the interpretation of the related
+ * policy's data structure changes
+ */
+#define ACM_POLICY_VERSION 3
+#define ACM_CHWALL_VERSION 1
+#define ACM_STE_VERSION 1
+
+/* defines a ssid reference used by xen */
+typedef uint32_t ssidref_t;
+
+/* hooks that are known to domains */
+#define ACMHOOK_none 0
+#define ACMHOOK_sharing 1
+#define ACMHOOK_authorization 2
+
+/* -------security policy relevant type definitions-------- */
+
+/* type identifier; compares to "equal" or "not equal" */
+typedef uint16_t domaintype_t;
+
+/* CHINESE WALL POLICY DATA STRUCTURES
+ *
+ * current accumulated conflict type set:
+ * When a domain is started and has a type that is in
+ * a conflict set, the conflicting types are incremented in
+ * the aggregate set. When a domain is destroyed, the
+ * conflicting types to its type are decremented.
+ * If a domain has multiple types, this procedure works over
+ * all those types.
+ *
+ * conflict_aggregate_set[i] holds the number of
+ * running domains that have a conflict with type i.
+ *
+ * running_types[i] holds the number of running domains
+ * that include type i in their ssidref-referenced type set
+ *
+ * conflict_sets[i][j] is "0" if type j has no conflict
+ * with type i and is "1" otherwise.
+ */
+/* high-16 = version, low-16 = check magic */
+#define ACM_MAGIC 0x0001debc
+
+/* each offset in bytes from start of the struct they
+ * are part of */
+
+/* V3 of the policy buffer aded a version structure */
+struct acm_policy_version
+{
+ uint32_t major;
+ uint32_t minor;
+};
+
+
+/* each buffer consists of all policy information for
+ * the respective policy given in the policy code
+ *
+ * acm_policy_buffer, acm_chwall_policy_buffer,
+ * and acm_ste_policy_buffer need to stay 32-bit aligned
+ * because we create binary policies also with external
+ * tools that assume packed representations (e.g. the java tool)
+ */
+struct acm_policy_buffer {
+ uint32_t magic;
+ uint32_t policy_version; /* ACM_POLICY_VERSION */
+ uint32_t len;
+ uint32_t policy_reference_offset;
+ uint32_t primary_policy_code;
+ uint32_t primary_buffer_offset;
+ uint32_t secondary_policy_code;
+ uint32_t secondary_buffer_offset;
+ struct acm_policy_version xml_pol_version; /* add in V3 */
+};
+
+
+struct acm_policy_reference_buffer {
+ uint32_t len;
+};
+
+struct acm_chwall_policy_buffer {
+ uint32_t policy_version; /* ACM_CHWALL_VERSION */
+ uint32_t policy_code;
+ uint32_t chwall_max_types;
+ uint32_t chwall_max_ssidrefs;
+ uint32_t chwall_max_conflictsets;
+ uint32_t chwall_ssid_offset;
+ uint32_t chwall_conflict_sets_offset;
+ uint32_t chwall_running_types_offset;
+ uint32_t chwall_conflict_aggregate_offset;
+};
+
+struct acm_ste_policy_buffer {
+ uint32_t policy_version; /* ACM_STE_VERSION */
+ uint32_t policy_code;
+ uint32_t ste_max_types;
+ uint32_t ste_max_ssidrefs;
+ uint32_t ste_ssid_offset;
+};
+
+struct acm_stats_buffer {
+ uint32_t magic;
+ uint32_t len;
+ uint32_t primary_policy_code;
+ uint32_t primary_stats_offset;
+ uint32_t secondary_policy_code;
+ uint32_t secondary_stats_offset;
+};
+
+struct acm_ste_stats_buffer {
+ uint32_t ec_eval_count;
+ uint32_t gt_eval_count;
+ uint32_t ec_denied_count;
+ uint32_t gt_denied_count;
+ uint32_t ec_cachehit_count;
+ uint32_t gt_cachehit_count;
+};
+
+struct acm_ssid_buffer {
+ uint32_t len;
+ ssidref_t ssidref;
+ uint32_t policy_reference_offset;
+ uint32_t primary_policy_code;
+ uint32_t primary_max_types;
+ uint32_t primary_types_offset;
+ uint32_t secondary_policy_code;
+ uint32_t secondary_max_types;
+ uint32_t secondary_types_offset;
+};
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * acm_ops.h: Xen access control module hypervisor commands
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Reiner Sailer <sailer@watson.ibm.com>
+ * Copyright (c) 2005,2006 International Business Machines Corporation.
+ */
+
+#ifndef __XEN_PUBLIC_ACM_OPS_H__
+#define __XEN_PUBLIC_ACM_OPS_H__
+
+#include "../xen.h"
+#include "acm.h"
+
+/*
+ * Make sure you increment the interface version whenever you modify this file!
+ * This makes sure that old versions of acm tools will stop working in a
+ * well-defined way (rather than crashing the machine, for instance).
+ */
+#define ACM_INTERFACE_VERSION 0xAAAA000A
+
+/************************************************************************/
+
+/*
+ * Prototype for this hypercall is:
+ * int acm_op(int cmd, void *args)
+ * @cmd == ACMOP_??? (access control module operation).
+ * @args == Operation-specific extra arguments (NULL if none).
+ */
+
+
+#define ACMOP_setpolicy 1
+struct acm_setpolicy {
+ /* IN */
+ XEN_GUEST_HANDLE_64(void) pushcache;
+ uint32_t pushcache_size;
+};
+
+
+#define ACMOP_getpolicy 2
+struct acm_getpolicy {
+ /* IN */
+ XEN_GUEST_HANDLE_64(void) pullcache;
+ uint32_t pullcache_size;
+};
+
+
+#define ACMOP_dumpstats 3
+struct acm_dumpstats {
+ /* IN */
+ XEN_GUEST_HANDLE_64(void) pullcache;
+ uint32_t pullcache_size;
+};
+
+
+#define ACMOP_getssid 4
+#define ACM_GETBY_ssidref 1
+#define ACM_GETBY_domainid 2
+struct acm_getssid {
+ /* IN */
+ uint32_t get_ssid_by; /* ACM_GETBY_* */
+ union {
+ domaintype_t domainid;
+ ssidref_t ssidref;
+ } id;
+ XEN_GUEST_HANDLE_64(void) ssidbuf;
+ uint32_t ssidbuf_size;
+};
+
+#define ACMOP_getdecision 5
+struct acm_getdecision {
+ /* IN */
+ uint32_t get_decision_by1; /* ACM_GETBY_* */
+ uint32_t get_decision_by2; /* ACM_GETBY_* */
+ union {
+ domaintype_t domainid;
+ ssidref_t ssidref;
+ } id1;
+ union {
+ domaintype_t domainid;
+ ssidref_t ssidref;
+ } id2;
+ uint32_t hook;
+ /* OUT */
+ uint32_t acm_decision;
+};
+
+
+#define ACMOP_chgpolicy 6
+struct acm_change_policy {
+ /* IN */
+ XEN_GUEST_HANDLE_64(void) policy_pushcache;
+ uint32_t policy_pushcache_size;
+ XEN_GUEST_HANDLE_64(void) del_array;
+ uint32_t delarray_size;
+ XEN_GUEST_HANDLE_64(void) chg_array;
+ uint32_t chgarray_size;
+ /* OUT */
+ /* array with error code */
+ XEN_GUEST_HANDLE_64(void) err_array;
+ uint32_t errarray_size;
+};
+
+#define ACMOP_relabeldoms 7
+struct acm_relabel_doms {
+ /* IN */
+ XEN_GUEST_HANDLE_64(void) relabel_map;
+ uint32_t relabel_map_size;
+ /* OUT */
+ XEN_GUEST_HANDLE_64(void) err_array;
+ uint32_t errarray_size;
+};
+
+/* future interface to Xen */
+struct xen_acmctl {
+ uint32_t cmd;
+ uint32_t interface_version;
+ union {
+ struct acm_setpolicy setpolicy;
+ struct acm_getpolicy getpolicy;
+ struct acm_dumpstats dumpstats;
+ struct acm_getssid getssid;
+ struct acm_getdecision getdecision;
+ struct acm_change_policy change_policy;
+ struct acm_relabel_doms relabel_doms;
+ } u;
+};
+
+typedef struct xen_acmctl xen_acmctl_t;
+DEFINE_XEN_GUEST_HANDLE(xen_acmctl_t);
+
+#endif /* __XEN_PUBLIC_ACM_OPS_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * This file contains the flask_op hypercall commands and definitions.
+ *
+ * Author: George Coker, <gscoker@alpha.ncsc.mil>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef __FLASK_OP_H__
+#define __FLASK_OP_H__
+
+#define FLASK_LOAD 1
+#define FLASK_GETENFORCE 2
+#define FLASK_SETENFORCE 3
+#define FLASK_CONTEXT_TO_SID 4
+#define FLASK_SID_TO_CONTEXT 5
+#define FLASK_ACCESS 6
+#define FLASK_CREATE 7
+#define FLASK_RELABEL 8
+#define FLASK_USER 9
+#define FLASK_POLICYVERS 10
+#define FLASK_GETBOOL 11
+#define FLASK_SETBOOL 12
+#define FLASK_COMMITBOOLS 13
+#define FLASK_MLS 14
+#define FLASK_DISABLE 15
+#define FLASK_GETAVC_THRESHOLD 16
+#define FLASK_SETAVC_THRESHOLD 17
+#define FLASK_AVC_HASHSTATS 18
+#define FLASK_AVC_CACHESTATS 19
+#define FLASK_MEMBER 20
+
+typedef struct flask_op {
+ int cmd;
+ int size;
+ char *buf;
+} flask_op_t;
+
+DEFINE_XEN_GUEST_HANDLE(flask_op_t);
+
+#endif
--- /dev/null
+/*
+ * PCI Frontend - arch-dependendent declarations
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#ifndef __XEN_ASM_PCIFRONT_H__
+#define __XEN_ASM_PCIFRONT_H__
+
+#include <linux/spinlock.h>
+
+#ifdef __KERNEL__
+
+#ifndef __ia64__
+
+#include <asm/pci.h>
+
+struct pcifront_device;
+struct pci_bus;
+#define pcifront_sd pci_sysdata
+
+static inline struct pcifront_device *
+pcifront_get_pdev(struct pcifront_sd *sd)
+{
+ return sd->pdev;
+}
+
+static inline void pcifront_init_sd(struct pcifront_sd *sd,
+ unsigned int domain, unsigned int bus,
+ struct pcifront_device *pdev)
+{
+ sd->domain = domain;
+ sd->pdev = pdev;
+}
+
+static inline void pcifront_setup_root_resources(struct pci_bus *bus,
+ struct pcifront_sd *sd)
+{
+}
+
+#else /* __ia64__ */
+
+#include <linux/acpi.h>
+#include <asm/pci.h>
+#define pcifront_sd pci_controller
+
+extern void xen_add_resource(struct pci_controller *, unsigned int,
+ unsigned int, struct acpi_resource *);
+extern void xen_pcibios_setup_root_windows(struct pci_bus *,
+ struct pci_controller *);
+
+static inline struct pcifront_device *
+pcifront_get_pdev(struct pcifront_sd *sd)
+{
+ return (struct pcifront_device *)sd->platform_data;
+}
+
+static inline void pcifront_setup_root_resources(struct pci_bus *bus,
+ struct pcifront_sd *sd)
+{
+ xen_pcibios_setup_root_windows(bus, sd);
+}
+
+#endif /* __ia64__ */
+
+extern struct rw_semaphore pci_bus_sem;
+
+#endif /* __KERNEL__ */
+
+#endif /* __XEN_ASM_PCIFRONT_H__ */
--- /dev/null
+/******************************************************************************
+ * evtchn.h
+ *
+ * Interface to /dev/xen/evtchn.
+ *
+ * Copyright (c) 2003-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_EVTCHN_H__
+#define __LINUX_PUBLIC_EVTCHN_H__
+
+/*
+ * Bind a fresh port to VIRQ @virq.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_VIRQ \
+ _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
+struct ioctl_evtchn_bind_virq {
+ unsigned int virq;
+};
+
+/*
+ * Bind a fresh port to remote <@remote_domain, @remote_port>.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_INTERDOMAIN \
+ _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
+struct ioctl_evtchn_bind_interdomain {
+ unsigned int remote_domain, remote_port;
+};
+
+/*
+ * Allocate a fresh port for binding to @remote_domain.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \
+ _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
+struct ioctl_evtchn_bind_unbound_port {
+ unsigned int remote_domain;
+};
+
+/*
+ * Unbind previously allocated @port.
+ */
+#define IOCTL_EVTCHN_UNBIND \
+ _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
+struct ioctl_evtchn_unbind {
+ unsigned int port;
+};
+
+/*
+ * Unbind previously allocated @port.
+ */
+#define IOCTL_EVTCHN_NOTIFY \
+ _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
+struct ioctl_evtchn_notify {
+ unsigned int port;
+};
+
+/* Clear and reinitialise the event buffer. Clear error condition. */
+#define IOCTL_EVTCHN_RESET \
+ _IOC(_IOC_NONE, 'E', 5, 0)
+
+#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
--- /dev/null
+/******************************************************************************
+ * gntdev.h
+ *
+ * Interface to /dev/xen/gntdev.
+ *
+ * Copyright (c) 2007, D G Murray
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_GNTDEV_H__
+#define __LINUX_PUBLIC_GNTDEV_H__
+
+struct ioctl_gntdev_grant_ref {
+ /* The domain ID of the grant to be mapped. */
+ uint32_t domid;
+ /* The grant reference of the grant to be mapped. */
+ uint32_t ref;
+};
+
+/*
+ * Inserts the grant references into the mapping table of an instance
+ * of gntdev. N.B. This does not perform the mapping, which is deferred
+ * until mmap() is called with @index as the offset.
+ */
+#define IOCTL_GNTDEV_MAP_GRANT_REF \
+_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
+struct ioctl_gntdev_map_grant_ref {
+ /* IN parameters */
+ /* The number of grants to be mapped. */
+ uint32_t count;
+ uint32_t pad;
+ /* OUT parameters */
+ /* The offset to be used on a subsequent call to mmap(). */
+ uint64_t index;
+ /* Variable IN parameter. */
+ /* Array of grant references, of size @count. */
+ struct ioctl_gntdev_grant_ref refs[1];
+};
+
+/*
+ * Removes the grant references from the mapping table of an instance of
+ * of gntdev. N.B. munmap() must be called on the relevant virtual address(es)
+ * before this ioctl is called, or an error will result.
+ */
+#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
+_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
+struct ioctl_gntdev_unmap_grant_ref {
+ /* IN parameters */
+ /* The offset was returned by the corresponding map operation. */
+ uint64_t index;
+ /* The number of pages to be unmapped. */
+ uint32_t count;
+ uint32_t pad;
+};
+
+/*
+ * Returns the offset in the driver's address space that corresponds
+ * to @vaddr. This can be used to perform a munmap(), followed by an
+ * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
+ * the caller. The number of pages that were allocated at the same time as
+ * @vaddr is returned in @count.
+ *
+ * N.B. Where more than one page has been mapped into a contiguous range, the
+ * supplied @vaddr must correspond to the start of the range; otherwise
+ * an error will result. It is only possible to munmap() the entire
+ * contiguously-allocated range at once, and not any subrange thereof.
+ */
+#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
+_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
+struct ioctl_gntdev_get_offset_for_vaddr {
+ /* IN parameters */
+ /* The virtual address of the first mapped page in a range. */
+ uint64_t vaddr;
+ /* OUT parameters */
+ /* The offset that was used in the initial mmap() operation. */
+ uint64_t offset;
+ /* The number of pages mapped in the VM area that begins at @vaddr. */
+ uint32_t count;
+ uint32_t pad;
+};
+
+/*
+ * Sets the maximum number of grants that may mapped at once by this gntdev
+ * instance.
+ *
+ * N.B. This must be called before any other ioctl is performed on the device.
+ */
+#define IOCTL_GNTDEV_SET_MAX_GRANTS \
+_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
+struct ioctl_gntdev_set_max_grants {
+ /* IN parameter */
+ /* The maximum number of grants that may be mapped at once. */
+ uint32_t count;
+};
+
+#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
--- /dev/null
+/******************************************************************************
+ * privcmd.h
+ *
+ * Interface to /proc/xen/privcmd.
+ *
+ * Copyright (c) 2003-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_PRIVCMD_H__
+#define __LINUX_PUBLIC_PRIVCMD_H__
+
+#include <linux/types.h>
+
+#ifndef __user
+#define __user
+#endif
+
+typedef struct privcmd_hypercall
+{
+ __u64 op;
+ __u64 arg[5];
+} privcmd_hypercall_t;
+
+typedef struct privcmd_mmap_entry {
+ __u64 va;
+ __u64 mfn;
+ __u64 npages;
+} privcmd_mmap_entry_t;
+
+typedef struct privcmd_mmap {
+ int num;
+ domid_t dom; /* target domain */
+ privcmd_mmap_entry_t __user *entry;
+} privcmd_mmap_t;
+
+typedef struct privcmd_mmapbatch {
+ int num; /* number of pages to populate */
+ domid_t dom; /* target domain */
+ __u64 addr; /* virtual address */
+ xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
+} privcmd_mmapbatch_t;
+
+/*
+ * @cmd: IOCTL_PRIVCMD_HYPERCALL
+ * @arg: &privcmd_hypercall_t
+ * Return: Value returned from execution of the specified hypercall.
+ */
+#define IOCTL_PRIVCMD_HYPERCALL \
+ _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
+#define IOCTL_PRIVCMD_MMAP \
+ _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
+#define IOCTL_PRIVCMD_MMAPBATCH \
+ _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
+
+#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
--- /dev/null
+#ifndef _XEN_SYSCTL_H
+#define _XEN_SYSCTL_H
+
+/* CTL_XEN names: */
+enum
+{
+ CTL_XEN_INDEPENDENT_WALLCLOCK=1,
+ CTL_XEN_PERMITTED_CLOCK_JITTER=2,
+};
+
+#endif /* _XEN_SYSCTL_H */
--- /dev/null
+
+#ifndef __ASM_XEN_PROC_H__
+#define __ASM_XEN_PROC_H__
+
+#include <linux/proc_fs.h>
+
+extern struct proc_dir_entry *create_xen_proc_entry(
+ const char *name, mode_t mode);
+extern void remove_xen_proc_entry(
+ const char *name);
+
+#endif /* __ASM_XEN_PROC_H__ */
#include <linux/mutex.h>
#include <linux/completion.h>
#include <linux/init.h>
+#include <linux/err.h>
#include <xen/interface/xen.h>
#include <xen/interface/grant_table.h>
#include <xen/interface/io/xenbus.h>
/* Callback (executed in a process context with no locks held). */
void (*callback)(struct xenbus_watch *,
const char **vec, unsigned int len);
+
+#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+ /* See XBWF_ definitions below. */
+ unsigned long flags;
+#endif
};
+#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+/*
+ * Execute callback in its own kthread. Useful if the callback is long
+ * running or heavily serialised, to avoid taking out the main xenwatch thread
+ * for a long period of time (or even unwittingly causing a deadlock).
+ */
+#define XBWF_new_thread 1
+#endif
/* A xenbus device. */
struct xenbus_device {
/* A xenbus driver. */
struct xenbus_driver {
- char *name;
- struct module *owner;
+ const char *name;
const struct xenbus_device_id *ids;
int (*probe)(struct xenbus_device *dev,
const struct xenbus_device_id *id);
int (*suspend)(struct xenbus_device *dev);
int (*suspend_cancel)(struct xenbus_device *dev);
int (*resume)(struct xenbus_device *dev);
- int (*uevent)(struct xenbus_device *, char **, int, char *, int);
+ int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *);
struct device_driver driver;
int (*read_otherend_details)(struct xenbus_device *dev);
int (*is_ready)(struct xenbus_device *dev);
static inline int __must_check
xenbus_register_frontend(struct xenbus_driver *drv)
{
- WARN_ON(drv->owner != THIS_MODULE);
return __xenbus_register_frontend(drv, THIS_MODULE, KBUILD_MODNAME);
}
static inline int __must_check
xenbus_register_backend(struct xenbus_driver *drv)
{
- WARN_ON(drv->owner != THIS_MODULE);
return __xenbus_register_backend(drv, THIS_MODULE, KBUILD_MODNAME);
}
/* Nil transaction ID. */
#define XBT_NIL ((struct xenbus_transaction) { 0 })
-int __init xenbus_dev_init(void);
-
char **xenbus_directory(struct xenbus_transaction t,
const char *dir, const char *node, unsigned int *num);
void *xenbus_read(struct xenbus_transaction t,
int xenbus_gather(struct xenbus_transaction t, const char *dir, ...);
/* notifer routines for when the xenstore comes up */
-extern int xenstored_ready;
int register_xenstore_notifier(struct notifier_block *nb);
void unregister_xenstore_notifier(struct notifier_block *nb);
/* Used by xenbus_dev to borrow kernel's store connection. */
void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg);
-struct work_struct;
-
/* Prepare for domain suspend: then resume or cancel the suspend. */
void xenbus_suspend(void);
void xenbus_resume(void);
-void xenbus_probe(struct work_struct *);
void xenbus_suspend_cancel(void);
#define XENBUS_IS_ERR_READ(str) ({ \
#define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE)
+
+/**
+ * Register a watch on the given path, using the given xenbus_watch structure
+ * for storage, and the given callback function as the callback. Return 0 on
+ * success, or -errno on error. On success, the given path will be saved as
+ * watch->node, and remains the caller's to free. On error, watch->node will
+ * be NULL, the device will switch to XenbusStateClosing, and the error will
+ * be saved in the store.
+ */
int xenbus_watch_path(struct xenbus_device *dev, const char *path,
struct xenbus_watch *watch,
void (*callback)(struct xenbus_watch *,
const char **, unsigned int));
+
+
+#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+/**
+ * Register a watch on the given path/path2, using the given xenbus_watch
+ * structure for storage, and the given callback function as the callback.
+ * Return 0 on success, or -errno on error. On success, the watched path
+ * (path/path2) will be saved as watch->node, and becomes the caller's to
+ * kfree(). On error, watch->node will be NULL, so the caller has nothing to
+ * free, the device will switch to XenbusStateClosing, and the error will be
+ * saved in the store.
+ */
+int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
+ const char *path2, struct xenbus_watch *watch,
+ void (*callback)(struct xenbus_watch *,
+ const char **, unsigned int));
+#else
int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch,
void (*callback)(struct xenbus_watch *,
const char **, unsigned int),
const char *pathfmt, ...)
__attribute__ ((format (printf, 4, 5)));
+#endif
+/**
+ * Advertise in the store a change of the given driver to the given new_state.
+ * Return 0 on success, or -errno on error. On error, the device will switch
+ * to XenbusStateClosing, and the error will be saved in the store.
+ */
int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
+
+
+/**
+ * Grant access to the given ring_mfn to the peer of the given device. Return
+ * 0 on success, or -errno on error. On error, the device will switch to
+ * XenbusStateClosing, and the error will be saved in the store.
+ */
int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
-int xenbus_map_ring_valloc(struct xenbus_device *dev,
- int gnt_ref, void **vaddr);
+
+
+/**
+ * Map a page of memory into this domain from another domain's grant table.
+ * xenbus_map_ring_valloc allocates a page of virtual address space, maps the
+ * page to that address, and sets *vaddr to that address.
+ * xenbus_map_ring does not allocate the virtual address space (you must do
+ * this yourself!). It only maps in the page to the specified address.
+ * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
+ * or -ENOMEM on error. If an error is returned, device will switch to
+ * XenbusStateClosing and the error message will be saved in XenStore.
+ */
+struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev,
+ int gnt_ref);
int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
grant_handle_t *handle, void *vaddr);
-int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr);
+
+/**
+ * Unmap a page of memory in this domain that was imported from another domain.
+ * Use xenbus_unmap_ring_vfree if you mapped in your memory with
+ * xenbus_map_ring_valloc (it will free the virtual address space).
+ * Returns 0 on success and returns GNTST_* on error
+ * (see xen/include/interface/grant_table.h).
+ */
+int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *);
int xenbus_unmap_ring(struct xenbus_device *dev,
grant_handle_t handle, void *vaddr);
+
+/**
+ * Allocate an event channel for the given xenbus_device, assigning the newly
+ * created local port to *port. Return 0 on success, or -errno on error. On
+ * error, the device will switch to XenbusStateClosing, and the error will be
+ * saved in the store.
+ */
int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
-int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);
+
+
+/**
+ * Free an existing event channel. Returns 0 on success or -errno on error.
+ */
int xenbus_free_evtchn(struct xenbus_device *dev, int port);
+
+/**
+ * Return the state of the driver rooted at the given store path, or
+ * XenbusStateUnknown if no state can be read.
+ */
enum xenbus_state xenbus_read_driver_state(const char *path);
-void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...);
-void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...);
+
+/***
+ * Report the given negative errno into the store, along with the given
+ * formatted message.
+ */
+void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
+ ...);
+
+
+/***
+ * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
+ * xenbus_switch_state(dev, NULL, XenbusStateClosing) to schedule an orderly
+ * closedown of this driver and its peer.
+ */
+void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
+ ...);
+
+int xenbus_dev_init(void);
const char *xenbus_strstate(enum xenbus_state state);
int xenbus_dev_is_online(struct xenbus_device *dev);
int xenbus_frontend_closed(struct xenbus_device *dev);
+int xenbus_for_each_backend(void *arg, int (*fn)(struct device *, void *));
+int xenbus_for_each_frontend(void *arg, int (*fn)(struct device *, void *));
+
#endif /* _XEN_XENBUS_H */
--- /dev/null
+#ifndef __ASM_XENCONS_H__
+#define __ASM_XENCONS_H__
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+struct dom0_vga_console_info;
+void dom0_init_screen_info(const struct dom0_vga_console_info *, size_t);
+#else
+#define dom0_init_screen_info(info) ((void)(info))
+#endif
+
+#ifdef CONFIG_XEN_CONSOLE
+void xencons_force_flush(void);
+void xencons_resume(void);
+
+/* Interrupt work hooks. Receive data, or kick data out. */
+void xencons_rx(char *buf, unsigned len);
+void xencons_tx(void);
+
+int xencons_ring_init(void);
+int xencons_ring_send(const char *data, unsigned len);
+#else
+static inline void xencons_force_flush(void) {}
+static inline void xencons_resume(void) {}
+#endif
+
+#endif /* __ASM_XENCONS_H__ */
--- /dev/null
+/******************************************************************************
+ * xen/xenoprof.h
+ *
+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef __XEN_XENOPROF_H__
+#define __XEN_XENOPROF_H__
+#ifdef CONFIG_XEN
+
+#include <asm/xenoprof.h>
+
+struct oprofile_operations;
+int xenoprofile_init(struct oprofile_operations * ops);
+void xenoprofile_exit(void);
+
+struct xenoprof_shared_buffer {
+ char *buffer;
+ struct xenoprof_arch_shared_buffer arch;
+};
+#else
+#define xenoprofile_init(ops) (-ENOSYS)
+#define xenoprofile_exit() do { } while (0)
+
+#endif /* CONFIG_XEN */
+#endif /* __XEN_XENOPROF_H__ */
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
+ depends on !XEN
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
*/
if (time_after(jiffies, desc->last_unhandled + HZ/10))
desc->irqs_unhandled = 1;
- else
+ else if (!irq_ignore_unhandled(irq))
desc->irqs_unhandled++;
desc->last_unhandled = jiffies;
if (unlikely(action_ret != IRQ_NONE))
return 0;
}
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order, unsigned long limit)
{
struct page *pages;
pages = alloc_pages(gfp_mask, order);
if (pages) {
unsigned int count, i;
+#ifdef CONFIG_XEN
+ int address_bits;
+
+ if (limit == ~0UL)
+ address_bits = BITS_PER_LONG;
+ else
+ address_bits = ilog2(limit);
+
+ if (xen_limit_pages_to_max_mfn(pages, order, address_bits) < 0) {
+ __free_pages(pages, order);
+ return NULL;
+ }
+#endif
pages->mapping = NULL;
set_page_private(pages, order);
count = 1 << order;
count = 1 << order;
for (i = 0; i < count; i++)
ClearPageReserved(page + i);
+#ifdef CONFIG_XEN
+ xen_destroy_contiguous_region((unsigned long)page_address(page), order);
+#endif
__free_pages(page, order);
}
do {
unsigned long pfn, epfn, addr, eaddr;
- pages = kimage_alloc_pages(GFP_KERNEL, order);
+ pages = kimage_alloc_pages(GFP_KERNEL, order, KEXEC_CONTROL_MEMORY_LIMIT);
if (!pages)
break;
- pfn = page_to_pfn(pages);
+ pfn = kexec_page_to_pfn(pages);
epfn = pfn + count;
addr = pfn << PAGE_SHIFT;
eaddr = epfn << PAGE_SHIFT;
return pages;
}
+#ifndef CONFIG_XEN
static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
unsigned int order)
{
}
/* If I don't overlap any segments I have found my hole! */
if (i == image->nr_segments) {
- pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+ pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
break;
}
}
return pages;
}
+#else /* !CONFIG_XEN */
+struct page *kimage_alloc_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ return kimage_alloc_normal_control_pages(image, order);
+}
+#endif
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
return -ENOMEM;
ind_page = page_address(page);
- *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+ *image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
image->entry = ind_page;
image->last_entry = ind_page +
((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
#define for_each_kimage_entry(image, ptr, entry) \
for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
ptr = (entry & IND_INDIRECTION)? \
- phys_to_virt((entry & PAGE_MASK)): ptr +1)
+ kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
static void kimage_free_entry(kimage_entry_t entry)
{
struct page *page;
- page = pfn_to_page(entry >> PAGE_SHIFT);
+ page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
kimage_free_pages(page);
}
if (!image)
return;
+#ifdef CONFIG_XEN
+ xen_machine_kexec_unload(image);
+#endif
+
kimage_free_extra_pages(image);
for_each_kimage_entry(image, ptr, entry) {
if (entry & IND_INDIRECTION) {
* have a match.
*/
list_for_each_entry(page, &image->dest_pages, lru) {
- addr = page_to_pfn(page) << PAGE_SHIFT;
+ addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
if (addr == destination) {
list_del(&page->lru);
return page;
kimage_entry_t *old;
/* Allocate a page, if we run out of memory give up */
- page = kimage_alloc_pages(gfp_mask, 0);
+ page = kimage_alloc_pages(gfp_mask, 0, KEXEC_SOURCE_MEMORY_LIMIT);
if (!page)
return NULL;
/* If the page cannot be used file it away */
- if (page_to_pfn(page) >
+ if (kexec_page_to_pfn(page) >
(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
list_add(&page->lru, &image->unuseable_pages);
continue;
}
- addr = page_to_pfn(page) << PAGE_SHIFT;
+ addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
/* If it is the destination page we want use it */
if (addr == destination)
struct page *old_page;
old_addr = *old & PAGE_MASK;
- old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+ old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
copy_highpage(page, old_page);
*old = addr | (*old & ~PAGE_MASK);
result = -ENOMEM;
goto out;
}
- result = kimage_add_page(image, page_to_pfn(page)
+ result = kimage_add_page(image, kexec_page_to_pfn(page)
<< PAGE_SHIFT);
if (result < 0)
goto out;
return result;
}
+#ifndef CONFIG_XEN
static int kimage_load_crash_segment(struct kimage *image,
struct kexec_segment *segment)
{
char *ptr;
size_t uchunk, mchunk;
- page = pfn_to_page(maddr >> PAGE_SHIFT);
+ page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
if (!page) {
result = -ENOMEM;
goto out;
return result;
}
+#else /* CONFIG_XEN */
+static int kimage_load_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ return kimage_load_normal_segment(image, segment);
+}
+#endif
/*
* Exec Kernel system call: for obvious reasons only root may call it.
if (result)
goto out;
}
+#ifdef CONFIG_XEN
+ if (image) {
+ result = xen_machine_kexec_load(image);
+ if (result)
+ goto out;
+ }
+#endif
/* Install the new kernel, and Uninstall the old */
image = xchg(dest_image, image);
module_init(crash_notes_memory_init)
+#ifndef CONFIG_XEN
/*
* parsing the "crashkernel" commandline
*
return 0;
}
-
+#endif
void crash_save_vmcoreinfo(void)
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#ifndef CONFIG_XEN
{
.procname = "nmi_watchdog",
.data = &nmi_watchdog_enabled,
.proc_handler = &proc_nmi_enabled,
},
#endif
+#endif
#if defined(CONFIG_X86)
{
.ctl_name = KERN_PANIC_ON_NMI,
.proc_handler = &proc_dointvec,
},
#endif
-#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
+#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) && !defined(CONFIG_ACPI_PV_SLEEP)
{
.procname = "acpi_video_flags",
.data = &acpi_realmode_flags,
#include <linux/sunrpc/debug.h>
#include <linux/string.h>
#include <net/ip_vs.h>
+#include <xen/sysctl.h>
struct trans_ctl_table {
int ctl_name;
{}
};
+#ifdef CONFIG_XEN
+static const struct trans_ctl_table trans_xen_table[] = {
+ { CTL_XEN_INDEPENDENT_WALLCLOCK, "independent_wallclock" },
+ { CTL_XEN_PERMITTED_CLOCK_JITTER, "permitted_clock_jitter" },
+ {}
+};
+#endif
+
static const struct trans_ctl_table trans_arlan_conf_table0[] = {
{ 1, "spreadingCode" },
{ 2, "channelNumber" },
{ CTL_BUS, "bus", trans_bus_table },
{ CTL_ABI, "abi" },
/* CTL_CPU not used */
+#ifdef CONFIG_XEN
+ { CTL_XEN, "xen", trans_xen_table },
+#endif
{ CTL_ARLAN, "arlan", trans_arlan_table },
{ CTL_S390DBF, "s390dbf", trans_s390dbf_table },
{ CTL_SUNRPC, "sunrpc", trans_sunrpc_table },
--- /dev/null
+/*
+ * Dynamic DMA mapping support.
+ *
+ * This implementation is a fallback for platforms that do not support
+ * I/O TLBs (aka DMA address translation hardware).
+ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
+ * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
+ * Copyright (C) 2000, 2003 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2005 Keir Fraser <keir@xensource.com>
+ */
+
+#include <linux/cache.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/iommu-helper.h>
+#include <linux/highmem.h>
+#include <asm/io.h>
+#include <asm/pci.h>
+#include <asm/dma.h>
+#include <asm/uaccess.h>
+#include <xen/gnttab.h>
+#include <xen/interface/memory.h>
+#include <asm/gnttab_dma.h>
+
+int swiotlb;
+
+#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
+
+/*
+ * Maximum allowable number of contiguous slabs to map,
+ * must be a power of 2. What is the appropriate value ?
+ * The complexity of {map,unmap}_single is linearly dependent on this value.
+ */
+#define IO_TLB_SEGSIZE 128
+
+/*
+ * log of the size of each IO TLB slab. The number of slabs is command line
+ * controllable.
+ */
+#define IO_TLB_SHIFT 11
+
+int swiotlb_force;
+
+static char *iotlb_virt_start;
+static unsigned long iotlb_nslabs;
+
+/*
+ * Used to do a quick range check in swiotlb_unmap_single and
+ * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
+ * API.
+ */
+static unsigned long iotlb_pfn_start, iotlb_pfn_end;
+
+/* Does the given dma address reside within the swiotlb aperture? */
+static inline int in_swiotlb_aperture(dma_addr_t dev_addr)
+{
+ unsigned long pfn = mfn_to_local_pfn(dev_addr >> PAGE_SHIFT);
+ return (pfn_valid(pfn)
+ && (pfn >= iotlb_pfn_start)
+ && (pfn < iotlb_pfn_end));
+}
+
+/*
+ * When the IOMMU overflows we return a fallback buffer. This sets the size.
+ */
+static unsigned long io_tlb_overflow = 32*1024;
+
+void *io_tlb_overflow_buffer;
+
+/*
+ * This is a free list describing the number of free entries available from
+ * each index
+ */
+static unsigned int *io_tlb_list;
+static unsigned int io_tlb_index;
+
+/*
+ * We need to save away the original address corresponding to a mapped entry
+ * for the sync operations.
+ */
+static struct phys_addr {
+ struct page *page;
+ unsigned int offset;
+} *io_tlb_orig_addr;
+
+/*
+ * Protect the above data structures in the map and unmap calls
+ */
+static DEFINE_SPINLOCK(io_tlb_lock);
+
+static unsigned int dma_bits;
+static unsigned int __initdata max_dma_bits = 32;
+static int __init
+setup_dma_bits(char *str)
+{
+ max_dma_bits = simple_strtoul(str, NULL, 0);
+ return 0;
+}
+__setup("dma_bits=", setup_dma_bits);
+
+static int __init
+setup_io_tlb_npages(char *str)
+{
+ /* Unlike ia64, the size is aperture in megabytes, not 'slabs'! */
+ if (isdigit(*str)) {
+ iotlb_nslabs = simple_strtoul(str, &str, 0) <<
+ (20 - IO_TLB_SHIFT);
+ iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
+ /* Round up to power of two (xen_create_contiguous_region). */
+ while (iotlb_nslabs & (iotlb_nslabs-1))
+ iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
+ }
+ if (*str == ',')
+ ++str;
+ /*
+ * NB. 'force' enables the swiotlb, but doesn't force its use for
+ * every DMA like it does on native Linux. 'off' forcibly disables
+ * use of the swiotlb.
+ */
+ if (!strcmp(str, "force"))
+ swiotlb_force = 1;
+ else if (!strcmp(str, "off"))
+ swiotlb_force = -1;
+ return 1;
+}
+__setup("swiotlb=", setup_io_tlb_npages);
+/* make io_tlb_overflow tunable too? */
+
+/*
+ * Statically reserve bounce buffer space and initialize bounce buffer data
+ * structures for the software IO TLB used to implement the PCI DMA API.
+ */
+void __init
+swiotlb_init_with_default_size(size_t default_size)
+{
+ unsigned long i, bytes;
+ int rc;
+
+ if (!iotlb_nslabs) {
+ iotlb_nslabs = (default_size >> IO_TLB_SHIFT);
+ iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
+ /* Round up to power of two (xen_create_contiguous_region). */
+ while (iotlb_nslabs & (iotlb_nslabs-1))
+ iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
+ }
+
+ bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT);
+
+ /*
+ * Get IO TLB memory from the low pages
+ */
+ iotlb_virt_start = alloc_bootmem_low_pages(bytes);
+ if (!iotlb_virt_start)
+ panic("Cannot allocate SWIOTLB buffer!\n");
+
+ dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
+ for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) {
+ do {
+ rc = xen_create_contiguous_region(
+ (unsigned long)iotlb_virt_start + (i << IO_TLB_SHIFT),
+ get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT),
+ dma_bits);
+ } while (rc && dma_bits++ < max_dma_bits);
+ if (rc) {
+ if (i == 0)
+ panic("No suitable physical memory available for SWIOTLB buffer!\n"
+ "Use dom0_mem Xen boot parameter to reserve\n"
+ "some DMA memory (e.g., dom0_mem=-128M).\n");
+ iotlb_nslabs = i;
+ i <<= IO_TLB_SHIFT;
+ free_bootmem(__pa(iotlb_virt_start + i), bytes - i);
+ bytes = i;
+ for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) {
+ unsigned int bits = fls64(virt_to_bus(iotlb_virt_start + i - 1));
+
+ if (bits > dma_bits)
+ dma_bits = bits;
+ }
+ break;
+ }
+ }
+
+ /*
+ * Allocate and initialize the free list array. This array is used
+ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
+ */
+ io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int));
+ for (i = 0; i < iotlb_nslabs; i++)
+ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+ io_tlb_index = 0;
+ io_tlb_orig_addr = alloc_bootmem(
+ iotlb_nslabs * sizeof(*io_tlb_orig_addr));
+
+ /*
+ * Get the overflow emergency buffer
+ */
+ io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
+ if (!io_tlb_overflow_buffer)
+ panic("Cannot allocate SWIOTLB overflow buffer!\n");
+
+ do {
+ rc = xen_create_contiguous_region(
+ (unsigned long)io_tlb_overflow_buffer,
+ get_order(io_tlb_overflow),
+ dma_bits);
+ } while (rc && dma_bits++ < max_dma_bits);
+ if (rc)
+ panic("No suitable physical memory available for SWIOTLB overflow buffer!\n");
+
+ iotlb_pfn_start = __pa(iotlb_virt_start) >> PAGE_SHIFT;
+ iotlb_pfn_end = iotlb_pfn_start + (bytes >> PAGE_SHIFT);
+
+ printk(KERN_INFO "Software IO TLB enabled: \n"
+ " Aperture: %lu megabytes\n"
+ " Kernel range: %p - %p\n"
+ " Address size: %u bits\n",
+ bytes >> 20,
+ iotlb_virt_start, iotlb_virt_start + bytes,
+ dma_bits);
+}
+
+void __init
+swiotlb_init(void)
+{
+ unsigned long ram_end;
+ size_t defsz = 64 << 20; /* 64MB default size */
+
+ if (swiotlb_force == 1) {
+ swiotlb = 1;
+ } else if ((swiotlb_force != -1) &&
+ is_running_on_xen() &&
+ is_initial_xendomain()) {
+ /* Domain 0 always has a swiotlb. */
+ ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
+ if (ram_end <= 0x1ffff)
+ defsz = 2 << 20; /* 2MB on <512MB systems. */
+ else if (ram_end <= 0x3ffff)
+ defsz = 4 << 20; /* 4MB on <1GB systems. */
+ else if (ram_end <= 0x7ffff)
+ defsz = 8 << 20; /* 8MB on <2GB systems. */
+ swiotlb = 1;
+ }
+
+ if (swiotlb)
+ swiotlb_init_with_default_size(defsz);
+ else
+ printk(KERN_INFO "Software IO TLB disabled\n");
+}
+
+/*
+ * We use __copy_to_user_inatomic to transfer to the host buffer because the
+ * buffer may be mapped read-only (e.g, in blkback driver) but lower-level
+ * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an
+ * unnecessary copy from the aperture to the host buffer, and a page fault.
+ */
+static void
+__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir)
+{
+ if (PageHighMem(buffer.page)) {
+ size_t len, bytes;
+ char *dev, *host, *kmp;
+ len = size;
+ while (len != 0) {
+ unsigned long flags;
+
+ if (((bytes = len) + buffer.offset) > PAGE_SIZE)
+ bytes = PAGE_SIZE - buffer.offset;
+ local_irq_save(flags); /* protects KM_BOUNCE_READ */
+ kmp = kmap_atomic(buffer.page, KM_BOUNCE_READ);
+ dev = dma_addr + size - len;
+ host = kmp + buffer.offset;
+ if (dir == DMA_FROM_DEVICE) {
+ if (__copy_to_user_inatomic(host, dev, bytes))
+ /* inaccessible */;
+ } else
+ memcpy(dev, host, bytes);
+ kunmap_atomic(kmp, KM_BOUNCE_READ);
+ local_irq_restore(flags);
+ len -= bytes;
+ buffer.page++;
+ buffer.offset = 0;
+ }
+ } else {
+ char *host = (char *)phys_to_virt(
+ page_to_pseudophys(buffer.page)) + buffer.offset;
+ if (dir == DMA_FROM_DEVICE) {
+ if (__copy_to_user_inatomic(host, dma_addr, size))
+ /* inaccessible */;
+ } else if (dir == DMA_TO_DEVICE)
+ memcpy(dma_addr, host, size);
+ }
+}
+
+/*
+ * Allocates bounce buffer and returns its kernel virtual address.
+ */
+static void *
+map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir)
+{
+ unsigned long flags;
+ char *dma_addr;
+ unsigned int nslots, stride, index, wrap;
+ struct phys_addr slot_buf;
+ int i;
+ unsigned long mask;
+ unsigned long offset_slots;
+ unsigned long max_slots;
+
+ mask = dma_get_seg_boundary(hwdev);
+ offset_slots = -IO_TLB_SEGSIZE;
+ max_slots = mask + 1
+ ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
+ : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
+
+ /*
+ * For mappings greater than a page, we limit the stride (and
+ * hence alignment) to a page size.
+ */
+ nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+ if (size > PAGE_SIZE)
+ stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
+ else
+ stride = 1;
+
+ BUG_ON(!nslots);
+
+ /*
+ * Find suitable number of IO TLB entries size that will fit this
+ * request and allocate a buffer from that IO TLB pool.
+ */
+ spin_lock_irqsave(&io_tlb_lock, flags);
+ index = ALIGN(io_tlb_index, stride);
+ if (index >= iotlb_nslabs)
+ index = 0;
+ wrap = index;
+
+ do {
+ while (iommu_is_span_boundary(index, nslots, offset_slots,
+ max_slots)) {
+ index += stride;
+ if (index >= iotlb_nslabs)
+ index = 0;
+ if (index == wrap)
+ goto not_found;
+ }
+
+ /*
+ * If we find a slot that indicates we have 'nslots' number of
+ * contiguous buffers, we allocate the buffers from that slot
+ * and mark the entries as '0' indicating unavailable.
+ */
+ if (io_tlb_list[index] >= nslots) {
+ int count = 0;
+
+ for (i = index; i < (int) (index + nslots); i++)
+ io_tlb_list[i] = 0;
+ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
+ io_tlb_list[i] = ++count;
+ dma_addr = iotlb_virt_start + (index << IO_TLB_SHIFT);
+
+ /*
+ * Update the indices to avoid searching in the next
+ * round.
+ */
+ io_tlb_index = ((index + nslots) < iotlb_nslabs
+ ? (index + nslots) : 0);
+
+ goto found;
+ }
+ index += stride;
+ if (index >= iotlb_nslabs)
+ index = 0;
+ } while (index != wrap);
+
+not_found:
+ spin_unlock_irqrestore(&io_tlb_lock, flags);
+ return NULL;
+found:
+ spin_unlock_irqrestore(&io_tlb_lock, flags);
+
+ /*
+ * Save away the mapping from the original address to the DMA address.
+ * This is needed when we sync the memory. Then we sync the buffer if
+ * needed.
+ */
+ slot_buf = buffer;
+ for (i = 0; i < nslots; i++) {
+ slot_buf.page += slot_buf.offset >> PAGE_SHIFT;
+ slot_buf.offset &= PAGE_SIZE - 1;
+ io_tlb_orig_addr[index+i] = slot_buf;
+ slot_buf.offset += 1 << IO_TLB_SHIFT;
+ }
+ if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL))
+ __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
+
+ return dma_addr;
+}
+
+static struct phys_addr dma_addr_to_phys_addr(char *dma_addr)
+{
+ int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
+ struct phys_addr buffer = io_tlb_orig_addr[index];
+ buffer.offset += (long)dma_addr & ((1 << IO_TLB_SHIFT) - 1);
+ buffer.page += buffer.offset >> PAGE_SHIFT;
+ buffer.offset &= PAGE_SIZE - 1;
+ return buffer;
+}
+
+/*
+ * dma_addr is the kernel virtual address of the bounce buffer to unmap.
+ */
+static void
+unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
+{
+ unsigned long flags;
+ int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+ int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
+ struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr);
+
+ /*
+ * First, sync the memory before unmapping the entry
+ */
+ if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
+ __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
+
+ /*
+ * Return the buffer to the free list by setting the corresponding
+ * entries to indicate the number of contigous entries available.
+ * While returning the entries to the free list, we merge the entries
+ * with slots below and above the pool being returned.
+ */
+ spin_lock_irqsave(&io_tlb_lock, flags);
+ {
+ count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
+ io_tlb_list[index + nslots] : 0);
+ /*
+ * Step 1: return the slots to the free list, merging the
+ * slots with superceeding slots
+ */
+ for (i = index + nslots - 1; i >= index; i--)
+ io_tlb_list[i] = ++count;
+ /*
+ * Step 2: merge the returned slots with the preceding slots,
+ * if available (non zero)
+ */
+ for (i = index - 1;
+ (OFFSET(i, IO_TLB_SEGSIZE) !=
+ IO_TLB_SEGSIZE -1) && io_tlb_list[i];
+ i--)
+ io_tlb_list[i] = ++count;
+ }
+ spin_unlock_irqrestore(&io_tlb_lock, flags);
+}
+
+static void
+sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
+{
+ struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr);
+ BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE));
+ __sync_single(buffer, dma_addr, size, dir);
+}
+
+static void
+swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
+{
+ /*
+ * Ran out of IOMMU space for this operation. This is very bad.
+ * Unfortunately the drivers cannot handle this operation properly.
+ * unless they check for pci_dma_mapping_error (most don't)
+ * When the mapping is small enough return a static buffer to limit
+ * the damage, or panic when the transfer is too big.
+ */
+ printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %zu bytes at "
+ "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?");
+
+ if (size > io_tlb_overflow && do_panic) {
+ if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+ panic("PCI-DMA: Memory would be corrupted\n");
+ if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+ panic("PCI-DMA: Random memory would be DMAed\n");
+ }
+}
+
+/*
+ * Map a single buffer of the indicated size for DMA in streaming mode. The
+ * PCI address to use is returned.
+ *
+ * Once the device is given the dma address, the device owns this memory until
+ * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
+ */
+static dma_addr_t
+_swiotlb_map_single(struct device *hwdev, phys_addr_t paddr, size_t size,
+ int dir, struct dma_attrs *attrs)
+{
+ struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
+ dma_addr_t dev_addr = gnttab_dma_map_page(page) +
+ offset_in_page(paddr);
+ void *map;
+ struct phys_addr buffer;
+
+ BUG_ON(dir == DMA_NONE);
+
+ /*
+ * If the pointer passed in happens to be in the device's DMA window,
+ * we can safely return the device addr and not worry about bounce
+ * buffering it.
+ */
+ if (!range_straddles_page_boundary(paddr, size) &&
+ !address_needs_mapping(hwdev, dev_addr))
+ return dev_addr;
+
+ /*
+ * Oh well, have to allocate and map a bounce buffer.
+ */
+ gnttab_dma_unmap_page(dev_addr);
+ buffer.page = page;
+ buffer.offset = offset_in_page(paddr);
+ map = map_single(hwdev, buffer, size, dir);
+ if (!map) {
+ swiotlb_full(hwdev, size, dir, 1);
+ map = io_tlb_overflow_buffer;
+ }
+
+ dev_addr = virt_to_bus(map);
+ return dev_addr;
+}
+
+dma_addr_t
+swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
+ int dir, struct dma_attrs *attrs)
+{
+ return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, attrs);
+}
+EXPORT_SYMBOL(swiotlb_map_single_attrs);
+
+dma_addr_t
+swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
+{
+ return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, NULL);
+}
+
+dma_addr_t
+swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
+{
+ return _swiotlb_map_single(hwdev, paddr, size, dir, NULL);
+}
+
+/*
+ * Unmap a single streaming mode DMA translation. The dma_addr and size must
+ * match what was provided for in a previous swiotlb_map_single call. All
+ * other usages are undefined.
+ *
+ * After this call, reads by the cpu to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+void
+swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, int dir, struct dma_attrs *attrs)
+{
+ BUG_ON(dir == DMA_NONE);
+ if (in_swiotlb_aperture(dev_addr))
+ unmap_single(hwdev, bus_to_virt(dev_addr), size, dir);
+ else
+ gnttab_dma_unmap_page(dev_addr);
+}
+EXPORT_SYMBOL(swiotlb_unmap_single_attrs);
+
+void
+swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
+ int dir)
+{
+ return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL);
+}
+/*
+ * Make physical memory consistent for a single streaming mode DMA translation
+ * after a transfer.
+ *
+ * If you perform a swiotlb_map_single() but wish to interrogate the buffer
+ * using the cpu, yet do not wish to teardown the PCI dma mapping, you must
+ * call this function before doing so. At the next point you give the PCI dma
+ * address back to the card, you must first perform a
+ * swiotlb_dma_sync_for_device, and then the device again owns the buffer
+ */
+void
+swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, int dir)
+{
+ BUG_ON(dir == DMA_NONE);
+ if (in_swiotlb_aperture(dev_addr))
+ sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
+}
+
+void
+swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, int dir)
+{
+ BUG_ON(dir == DMA_NONE);
+ if (in_swiotlb_aperture(dev_addr))
+ sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
+}
+
+void
+swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+ unsigned long offset, size_t size, int dir)
+{
+ BUG_ON(dir == DMA_NONE);
+ if (in_swiotlb_aperture(dev_addr))
+ sync_single(hwdev, bus_to_virt(dev_addr + offset), size, dir);
+}
+
+void
+swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
+ unsigned long offset, size_t size, int dir)
+{
+ BUG_ON(dir == DMA_NONE);
+ if (in_swiotlb_aperture(dev_addr))
+ sync_single(hwdev, bus_to_virt(dev_addr + offset), size, dir);
+}
+
+void swiotlb_unmap_sg_attrs(struct device *, struct scatterlist *, int, int,
+ struct dma_attrs *);
+/*
+ * Map a set of buffers described by scatterlist in streaming mode for DMA.
+ * This is the scatter-gather version of the above swiotlb_map_single
+ * interface. Here the scatter gather list elements are each tagged with the
+ * appropriate dma address and length. They are obtained via
+ * sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ * DMA address/length pairs than there are SG table elements.
+ * (for example via virtual mapping capabilities)
+ * The routine returns the number of addr/length pairs actually
+ * used, at most nents.
+ *
+ * Device ownership issues as mentioned above for swiotlb_map_single are the
+ * same here.
+ */
+int
+swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ int dir, struct dma_attrs *attrs)
+{
+ struct scatterlist *sg;
+ struct phys_addr buffer;
+ dma_addr_t dev_addr;
+ char *map;
+ int i;
+
+ BUG_ON(dir == DMA_NONE);
+
+ for_each_sg(sgl, sg, nelems, i) {
+ dev_addr = gnttab_dma_map_page(sg_page(sg)) + sg->offset;
+
+ if (range_straddles_page_boundary(page_to_pseudophys(sg_page(sg))
+ + sg->offset, sg->length)
+ || address_needs_mapping(hwdev, dev_addr)) {
+ gnttab_dma_unmap_page(dev_addr);
+ buffer.page = sg_page(sg);
+ buffer.offset = sg->offset;
+ map = map_single(hwdev, buffer, sg->length, dir);
+ if (!map) {
+ /* Don't panic here, we expect map_sg users
+ to do proper error handling. */
+ swiotlb_full(hwdev, sg->length, dir, 0);
+ swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
+ attrs);
+ sgl[0].dma_length = 0;
+ return 0;
+ }
+ sg->dma_address = virt_to_bus(map);
+ } else
+ sg->dma_address = dev_addr;
+ sg->dma_length = sg->length;
+ }
+ return nelems;
+}
+EXPORT_SYMBOL(swiotlb_map_sg_attrs);
+
+int
+swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ int dir)
+{
+ return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+}
+
+/*
+ * Unmap a set of streaming mode DMA translations. Again, cpu read rules
+ * concerning calls here are the same as for swiotlb_unmap_single() above.
+ */
+void
+swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, int dir, struct dma_attrs *attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+ BUG_ON(dir == DMA_NONE);
+
+ for_each_sg(sgl, sg, nelems, i) {
+ if (in_swiotlb_aperture(sg->dma_address))
+ unmap_single(hwdev, bus_to_virt(sg->dma_address),
+ sg->dma_length, dir);
+ else
+ gnttab_dma_unmap_page(sg->dma_address);
+ }
+}
+EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
+
+void
+swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ int dir)
+{
+ return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+}
+
+/*
+ * Make physical memory consistent for a set of streaming mode DMA translations
+ * after a transfer.
+ *
+ * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
+ * and usage.
+ */
+void
+swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, int dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ BUG_ON(dir == DMA_NONE);
+
+ for_each_sg(sgl, sg, nelems, i) {
+ if (in_swiotlb_aperture(sg->dma_address))
+ sync_single(hwdev, bus_to_virt(sg->dma_address),
+ sg->dma_length, dir);
+ }
+}
+
+void
+swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, int dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ BUG_ON(dir == DMA_NONE);
+
+ for_each_sg(sgl, sg, nelems, i) {
+ if (in_swiotlb_aperture(sg->dma_address))
+ sync_single(hwdev, bus_to_virt(sg->dma_address),
+ sg->dma_length, dir);
+ }
+}
+
+int
+swiotlb_dma_mapping_error(dma_addr_t dma_addr)
+{
+ return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
+}
+
+/*
+ * Return whether the given PCI device DMA address mask can be supported
+ * properly. For example, if your device can only drive the low 24-bits
+ * during PCI bus mastering, then you would pass 0x00ffffff as the mask to
+ * this function.
+ */
+int
+swiotlb_dma_supported (struct device *hwdev, u64 mask)
+{
+ return (mask >= ((1UL << dma_bits) - 1));
+}
+
+EXPORT_SYMBOL(swiotlb_map_single);
+EXPORT_SYMBOL(swiotlb_unmap_single);
+EXPORT_SYMBOL(swiotlb_map_sg);
+EXPORT_SYMBOL(swiotlb_unmap_sg);
+EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
+EXPORT_SYMBOL(swiotlb_sync_single_for_device);
+EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
+EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
+EXPORT_SYMBOL(swiotlb_dma_mapping_error);
+EXPORT_SYMBOL(swiotlb_dma_supported);
}
}
+#ifndef CONFIG_XEN
VM_BUG_ON(!pfn_valid(pfn));
+#else
+#ifdef CONFIG_X86
+ /* XEN: Covers user-space grant mappings (even of local pages). */
+ if (unlikely(vma->vm_flags & VM_FOREIGN))
+ return NULL;
+#endif
+ if (unlikely(!pfn_valid(pfn))) {
+ VM_BUG_ON(!(vma->vm_flags & VM_RESERVED));
+ return NULL;
+ }
+#endif
/*
* NOTE! We still have PageReserved() pages in the page tables.
page->index > details->last_index))
continue;
}
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
+#ifdef CONFIG_XEN
+ if (unlikely(vma->vm_ops && vma->vm_ops->zap_pte))
+ ptent = vma->vm_ops->zap_pte(vma, addr, pte,
+ tlb->fullmm);
+ else
+#endif
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
continue;
tlb_finish_mmu(tlb, address, end);
return end;
}
+EXPORT_SYMBOL(zap_page_range);
/*
* Do a quick page-table lookup for a single page.
continue;
}
+#ifdef CONFIG_XEN
+ if (vma && (vma->vm_flags & VM_FOREIGN)) {
+ struct page **map = vma->vm_private_data;
+ int offset = (start - vma->vm_start) >> PAGE_SHIFT;
+ if (map[offset] != NULL) {
+ if (pages) {
+ struct page *page = map[offset];
+
+ pages[i] = page;
+ get_page(page);
+ }
+ if (vmas)
+ vmas[i] = vma;
+ i++;
+ start += PAGE_SIZE;
+ len--;
+ continue;
+ }
+ }
+#endif
if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
|| !(vm_flags & vma->vm_flags))
return i ? : -EFAULT;
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
+ if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
+ continue;
change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
} while (pmd++, addr = next, addr != end);
}
int i;
int reserved = 0;
+#ifdef CONFIG_XEN
+ if (PageForeign(page)) {
+ PageForeignDestructor(page);
+ return;
+ }
+#endif
for (i = 0 ; i < (1 << order) ; ++i)
reserved += free_pages_check(page + i);
if (reserved)
struct per_cpu_pages *pcp;
unsigned long flags;
+#ifdef CONFIG_XEN
+ if (PageForeign(page)) {
+ PageForeignDestructor(page);
+ return;
+ }
+#endif
if (PageAnon(page))
page->mapping = NULL;
if (free_pages_check(page))
#include "net-sysfs.h"
+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
+#include <net/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#endif
+
/*
* The list of packet types we will receive (as opposed to discard)
* and the routines to invoke.
return 0;
}
+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
+inline int skb_checksum_setup(struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ unsigned char *th;
+ int err = -EPROTO;
+
+#ifdef CONFIG_XEN
+ if (!skb->proto_csum_blank)
+ return 0;
+#endif
+
+ if (skb->protocol != htons(ETH_P_IP))
+ goto out;
+
+ iph = ip_hdr(skb);
+ th = skb_network_header(skb) + 4 * iph->ihl;
+ if (th >= skb_tail_pointer(skb))
+ goto out;
+
+ skb->csum_start = th - skb->head;
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ skb->csum_offset = offsetof(struct tcphdr, check);
+ break;
+ case IPPROTO_UDP:
+ skb->csum_offset = offsetof(struct udphdr, check);
+ break;
+ default:
+ if (net_ratelimit())
+ printk(KERN_ERR "Attempting to checksum a non-"
+ "TCP/UDP packet, dropping a protocol"
+ " %d packet", iph->protocol);
+ goto out;
+ }
+
+ if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
+ goto out;
+
+#ifdef CONFIG_XEN
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->proto_csum_blank = 0;
+#endif
+
+ err = 0;
+
+out:
+ return err;
+}
+EXPORT_SYMBOL(skb_checksum_setup);
+#endif
+
/**
* dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
struct Qdisc *q;
int rc = -ENOMEM;
+ /* If a checksum-deferred packet is forwarded to a device that needs a
+ * checksum, correct the pointers and force checksumming.
+ */
+ if (skb_checksum_setup(skb))
+ goto out_kfree_skb;
+
/* GSO will handle the following emulations directly. */
if (netif_needs_gso(dev, skb))
goto gso;
}
#endif
+#ifdef CONFIG_XEN
+ switch (skb->ip_summed) {
+ case CHECKSUM_UNNECESSARY:
+ skb->proto_data_valid = 1;
+ break;
+ case CHECKSUM_PARTIAL:
+ /* XXX Implement me. */
+ default:
+ skb->proto_data_valid = 0;
+ break;
+ }
+#endif
+
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev)
n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
n->cloned = 1;
n->nohdr = 0;
+#ifdef CONFIG_XEN
+ C(proto_data_valid);
+ C(proto_csum_blank);
+#endif
n->destructor = NULL;
C(iif);
C(tail);
if (hdrsize < sizeof(*hdr))
return true;
+ if (skb_checksum_setup(skb))
+ return false;
+
inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
return true;
newport = tuple->dst.u.udp.port;
portptr = &hdr->dest;
}
+
+ if (skb_checksum_setup(skb))
+ return false;
+
if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
#endif
skb->protocol = htons(ETH_P_IP);
- return xfrm_output(skb);
+ return skb_checksum_setup(skb) ?: xfrm_output(skb);
}
int xfrm4_output(struct sk_buff *skb)
spin_lock_bh(&ifp->lock);
if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
+ !(dev->flags&IFF_MULTICAST) ||
!(ifp->flags&IFA_F_TENTATIVE) ||
ifp->flags & IFA_F_NODAD) {
ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC);
if (ifp->idev->cnf.forwarding == 0 &&
ifp->idev->cnf.rtr_solicits > 0 &&
(dev->flags&IFF_LOOPBACK) == 0 &&
+ (dev->flags & IFF_MULTICAST) &&
(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
/*
* If a host as already performed a random delay
$(warning kbuild: Makefile.build is included improperly)
endif
+ifeq ($(CONFIG_XEN),y)
+$(objtree)/scripts/Makefile.xen: $(srctree)/scripts/Makefile.xen.awk $(srctree)/scripts/Makefile.build
+ @echo ' Updating $@'
+ $(if $(shell echo a | $(AWK) '{ print gensub(/a/, "AA", "g"); }'),\
+ ,$(error 'Your awk program does not define gensub. Use gawk or another awk with gensub'))
+ @$(AWK) -f $< $(filter-out $<,$^) >$@
+
+xen-src-single-used-m := $(patsubst $(srctree)/%,%,$(wildcard $(addprefix $(srctree)/,$(single-used-m:.o=-xen.c))))
+xen-single-used-m := $(xen-src-single-used-m:-xen.c=.o)
+single-used-m := $(filter-out $(xen-single-used-m),$(single-used-m))
+
+-include $(objtree)/scripts/Makefile.xen
+endif
+
# ===========================================================================
ifneq ($(strip $(lib-y) $(lib-m) $(lib-n) $(lib-)),)
lib-y := $(filter-out $(obj-y), $(sort $(lib-y) $(lib-m)))
+# Remove objects forcibly disabled
+
+obj-y := $(filter-out $(disabled-obj-y),$(obj-y))
+obj-m := $(filter-out $(disabled-obj-y),$(obj-m))
+lib-y := $(filter-out $(disabled-obj-y),$(lib-y))
+
# Handle objects in subdirs
# ---------------------------------------------------------------------------
--- /dev/null
+BEGIN {
+ is_rule = 0
+}
+
+/^[[:space:]]*#/ {
+ next
+}
+
+/^[[:space:]]*$/ {
+ if (is_rule)
+ print("")
+ is_rule = 0
+ next
+}
+
+/:[[:space:]]*\$\(src\)\/%\.[cS][[:space:]]/ {
+ line = gensub(/%.([cS])/, "%-xen.\\1", "g", $0)
+ line = gensub(/(single-used-m)/, "xen-\\1", "g", line)
+ print line
+ is_rule = 1
+ next
+}
+
+/^[^\t]$/ {
+ if (is_rule)
+ print("")
+ is_rule = 0
+ next
+}
+
+is_rule {
+ print $0
+ next
+}